In [32]:
!pip install mistralai langchain langchain_community openai





In [76]:
from mistralai import Mistral, DocumentURLChunk, ImageURLChunk, TextChunk
from google.colab import userdata
import json


api_key = userdata.get('MISTRAL_API_KEY')

client = Mistral(api_key=api_key)

uploaded_pdf = client.files.upload(
    file={
        "file_name": "kering2024.pdf",
        "content": open("kering2024.pdf", "rb"),
    },
    purpose="ocr"
)

signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=1)

pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)

current_data_response = json.loads(pdf_response.json())

<ipython-input-76-f1e72609ae97>:22: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  current_data_response = json.loads(pdf_response.json())


In [77]:
uploaded_pdf = client.files.upload(
    file={
        "file_name": "kering2023.pdf",
        "content": open("kering2023.pdf", "rb"),
    },
    purpose="ocr"
)

signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id, expiry=1)

pdf_response = client.ocr.process(document=DocumentURLChunk(document_url=signed_url.url), model="mistral-ocr-latest", include_image_base64=True)

old_data_response = json.loads(pdf_response.json())

<ipython-input-77-a6519061c0f4>:13: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  old_data_response = json.loads(pdf_response.json())


In [139]:
def clean_relevant_ocr_response(ocr_response):

    keywords = [
        "total revenue",
        "cost of goods sold",
        "gross profit",
        "operating income",
        "net income",
        "total assets",
        "current assets",
        "current liabilities",
        "total liabilities",
        "shareholders' equity",
        "operating cash flow",
        "shares outstanding"
    ]

    cleaned_pages = []

    for page in ocr_response.get("pages", []):
        text = page.get("markdown", "").lower()  # Lowercase for case-insensitive comparison

        # Check if any of the relevant keywords appear in the page text
        if any(keyword in text for keyword in keywords):
            cleaned_pages.append({"index": page.get("index"), "markdown": page.get("markdown")})

    return {"pages": cleaned_pages}


In [145]:
import json
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Convert the OCR response to a JSON string for the prompt
data2024 = clean_relevant_ocr_response(current_data_response)
data2023 = clean_relevant_ocr_response(old_data_response)



prompt_template = PromptTemplate(
    input_variables=["ocr_text"],
    template="""
Extract the following financial metrics from the OCR text:

- Total Revenue
- Cost of Goods Sold
- Gross Profit (if missing, compute as Revenue - COGS)
- Operating Income
- Net Income
- Total Assets
- Current Assets
- Current Liabilities
- Total Liabilities
- Shareholders' Equity
- Operating Cash Flow
- Shares Outstanding (optional)

For each metric, do the following:
1. Extract the numeric value (ignore any currency symbols like €, $, commas, or extra spaces).
2. Identify the unit of measure when available (e.g., millions, billions, thousands).
3. Specify the page number where the metric appears.
4. If you cannot find the exact data, put None as value and page as -1.

For example, if the Total Revenue is on page 1 and the value is "5,000 million", you should return:

{{
  "Total Revenue": {{"value": 5000000000, "page": 1}},
  "Cost of Goods Sold": {{"value": 2000000000, "page": 1}},
  "Gross Profit": {{"value": 3000000000, "page": 1}},
  "Operating Income": {{"value": 2500000000,  "page": 1}},
  "Net Income": {{"value": 2200000000,  "page": 1}},
  "Total Assets": {{"value": 10000000000, "page": 1}},
  "Current Assets": {{"value": 5000000000, "page": 1}},
  "Current Liabilities": {{"value": 1000000000,"page": 1}},
  "Total Liabilities": {{"value": 4000000000, "page": 1}},
  "Shareholders' Equity": {{"value": 6000000000, "page": 1}},
  "Operating Cash Flow": {{"value": 1800000000,"page": 1}},
  "Shares Outstanding": {{"value": 500000000, "page": 1}}
}}

If any value is missing from the document, return:

{{
  "Total Revenue": {{"value": None, "unit": None, "page": -1}},
  "Cost of Goods Sold": {{"value": None, "unit": None, "page": -1}},
  ...
}}

- Ensure all numbers are extracted without currency symbols, commas, or spaces.
- Ensure the page numbers are correct.
- Use the correct unit (e.g., millions, billions, shares).
- If Gross Profit is not explicitly mentioned, compute it as (Total Revenue - Cost of Goods Sold).

{ocr_text}
"""
)



openai_api_key = userdata.get('OPEN_AI')
# Initialize ChatOpenAI (replacing OpenAI)
llm = ChatOpenAI(
    model_name="gpt-4o-mini",  # Change to "gpt-3.5-turbo" if needed
    temperature=0,
    api_key=openai_api_key
)

# Create the LangChain chain
chain = LLMChain(llm=llm, prompt=prompt_template)

# Run the chain with the OCR text
current_result = chain.run({"ocr_text": data2024})
old_result = chain.run({"ocr_text": data2023})


In [143]:
def compute_all_metrics(current_data, previous_data=None):
    """
    Compute derived financial metrics from the current extracted data.
    Optionally, if previous_data is provided (same format as current_data),
    also compute the Piotroski F-score.

    The expected keys in current_data (and previous_data) include:
      - "Total Revenue"
      - "Cost of Goods Sold"
      - "Gross Profit" (optional; if missing, computed as Revenue - COGS)
      - "Operating Income"
      - "Net Income"
      - "Total Assets"
      - "Current Assets"
      - "Current Liabilities"
      - "Total Liabilities"
      - "Shareholders' Equity"
      - "Operating Cash Flow" (for Piotroski scoring)
      - "Shares Outstanding" (optional, for Piotroski scoring)

    Returns a dictionary containing both the derived metrics and, if available,
    the Piotroski F-score.
    """

    def get_value(data, key):
        # Returns the numeric value for the given key or None if missing.
        item = data.get(key, {})
        return item.get("value")

    # Extract current values:
    revenue       = get_value(current_data, "Total Revenue")
    cogs          = get_value(current_data, "Cost of Goods Sold")
    gross_profit  = get_value(current_data, "Gross Profit")
    op_income     = get_value(current_data, "Operating Income")
    net_income    = get_value(current_data, "Net Income")
    total_assets  = get_value(current_data, "Total Assets")
    current_assets = get_value(current_data, "Current Assets")
    current_liabs = get_value(current_data, "Current Liabilities")
    total_liabs   = get_value(current_data, "Total Liabilities")
    equity        = get_value(current_data, "Shareholders' Equity")
    op_cash_flow  = get_value(current_data, "Operating Cash Flow")
    shares_out    = get_value(current_data, "Shares Outstanding")

    derived = {}

    # If Gross Profit is missing, compute it:
    if gross_profit is None and revenue is not None and cogs is not None:
        gross_profit = revenue - cogs
        derived["Computed Gross Profit"] = gross_profit

    # Profitability ratios:
    if revenue and gross_profit is not None:
        derived["Gross Profit Margin (%)"] = (gross_profit / revenue) * 100
    if revenue and op_income is not None:
        derived["Operating Profit Margin (%)"] = (op_income / revenue) * 100
    if revenue and net_income is not None:
        derived["Net Profit Margin (%)"] = (net_income / revenue) * 100
    if total_assets and net_income is not None:
        roa = net_income / total_assets
        derived["Return on Assets (ROA %)"] = roa * 100
    else:
        roa = None
    if equity and net_income is not None:
        roe = net_income / equity
        derived["Return on Equity (ROE %)"] = roe * 100
    else:
        roe = None

    # Leverage ratios:
    if equity and total_liabs is not None:
        derived["Debt-to-Equity Ratio"] = total_liabs / equity

    # Liquidity ratios:
    if current_assets and current_liabs:
        derived["Current Ratio"] = current_assets / current_liabs
        derived["Working Capital"] = current_assets - current_liabs

    # Efficiency ratio:
    if total_assets and revenue:
        derived["Asset Turnover Ratio"] = revenue / total_assets

    # If previous data is provided, we can compute changes for the Piotroski F-score.
    piotroski_score = None
    if previous_data is not None:
        piotroski_score = 0  # initialize F-score counter

        # Helper: compute ratio from data dictionary
        def compute_ratio(data, numerator_key, denominator_key):
            num = get_value(data, numerator_key)
            den = get_value(data, denominator_key)
            return num / den if num is not None and den else None

        # Current and previous ROA:
        prev_roa = None
        if total_assets and net_income is not None:
            curr_roa = net_income / total_assets
        else:
            curr_roa = None
        if get_value(previous_data, "Total Assets") and get_value(previous_data, "Net Income") is not None:
            prev_roa = get_value(previous_data, "Net Income") / get_value(previous_data, "Total Assets")

        # 1. ROA > 0
        score = 1 if curr_roa is not None and curr_roa > 0 else 0
        piotroski_score += score
        # 2. Operating Cash Flow > 0
        score = 1 if op_cash_flow is not None and op_cash_flow > 0 else 0
        piotroski_score += score
        # 3. Change in ROA: current ROA > previous ROA
        if prev_roa is not None and curr_roa is not None:
            score = 1 if curr_roa > prev_roa else 0
        else:
            score = 0
        piotroski_score += score
        # 4. Accruals: Operating Cash Flow > Net Income (i.e., lower accruals)
        score = 1 if op_cash_flow is not None and net_income is not None and op_cash_flow > net_income else 0
        piotroski_score += score

        # 5. Change in Leverage: current (Total Liabs/Total Assets) < previous
        curr_leverage = (total_liabs / total_assets) if total_liabs is not None and total_assets else None
        prev_total_liabs = get_value(previous_data, "Total Liabilities")
        prev_total_assets = get_value(previous_data, "Total Assets")
        prev_leverage = (prev_total_liabs / prev_total_assets) if prev_total_liabs and prev_total_assets else None
        if prev_leverage is not None and curr_leverage is not None:
            score = 1 if curr_leverage < prev_leverage else 0
        else:
            score = 0
        piotroski_score += score

        # 6. Change in Current Ratio: current (Current Assets/Current Liabilities) > previous
        curr_current_ratio = (current_assets / current_liabs) if current_assets and current_liabs else None
        prev_current_assets = get_value(previous_data, "Current Assets")
        prev_current_liabs = get_value(previous_data, "Current Liabilities")
        prev_current_ratio = (prev_current_assets / prev_current_liabs) if prev_current_assets and prev_current_liabs else None
        if prev_current_ratio is not None and curr_current_ratio is not None:
            score = 1 if curr_current_ratio > prev_current_ratio else 0
        else:
            score = 0
        piotroski_score += score

        # 7. Change in Shares Outstanding: if no new shares issued (i.e., current <= previous)
        if shares_out is not None:
            prev_shares = get_value(previous_data, "Shares Outstanding")
            if prev_shares is not None:
                score = 1 if shares_out <= prev_shares else 0
            else:
                score = 0
        else:
            score = 0
        piotroski_score += score

        # 8. Change in Gross Margin: (Gross Profit / Revenue) increased
        curr_gm = (gross_profit / revenue) if gross_profit is not None and revenue else None
        prev_gross_profit = get_value(previous_data, "Gross Profit")
        prev_revenue = get_value(previous_data, "Total Revenue")
        # If previous gross profit is not provided, try to compute it:
        if prev_gross_profit is None and prev_revenue is not None and get_value(previous_data, "Cost of Goods Sold") is not None:
            prev_gross_profit = get_value(previous_data, "Total Revenue") - get_value(previous_data, "Cost of Goods Sold")
        prev_gm = (prev_gross_profit / prev_revenue) if prev_gross_profit is not None and prev_revenue else None
        if prev_gm is not None and curr_gm is not None:
            score = 1 if curr_gm > prev_gm else 0
        else:
            score = 0
        piotroski_score += score

        # 9. Change in Asset Turnover: (Revenue / Total Assets) increased
        curr_at = (revenue / total_assets) if revenue and total_assets else None
        prev_revenue = get_value(previous_data, "Total Revenue")
        prev_total_assets = get_value(previous_data, "Total Assets")
        prev_at = (prev_revenue / prev_total_assets) if prev_revenue and prev_total_assets else None
        if prev_at is not None and curr_at is not None:
            score = 1 if curr_at > prev_at else 0
        else:
            score = 0
        piotroski_score += score

        derived["Piotroski F-score"] = piotroski_score

    derived["Piotroski F-score"] = piotroski_score if piotroski_score is not None else "N/A (previous data missing)"

    return derived

In [160]:
derived = compute_all_metrics(json.loads(current_result[7:-3]),json.loads(old_result[7:-3]))

In [161]:
derived

{'Gross Profit Margin (%)': 74.33407002442712,
 'Operating Profit Margin (%)': 13.446551122484587,
 'Net Profit Margin (%)': 6.589507967895777,
 'Return on Assets (ROA %)': 2.613851335763392,
 'Return on Equity (ROE %)': 7.60198604401503,
 'Debt-to-Equity Ratio': 1.209339774557166,
 'Current Ratio': 1.1074854045037532,
 'Working Capital': 1031000000,
 'Asset Turnover Ratio': 0.3966686660822221,
 'Piotroski F-score': 4}