In [14]:
from __future__ import annotations

import re
from typing import Tuple, Optional

import requests


DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    )
}


def fetch_page(url: str, timeout: float = 15.0) -> Tuple[Optional[str], str]:
    """Fetches a URL and returns (title, text). Best-effort, no external services.

    - Uses requests with a friendly UA and timeout.
    - Tries BeautifulSoup if available for better parsing; otherwise falls back to regex strip.
    """
    resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    resp.raise_for_status()
    html = resp.text

    title = None
    text_content = None

    try:
        from bs4 import BeautifulSoup  # type: ignore

        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.get_text(strip=True) if soup.title else None
        # Remove script/style
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text_content = soup.get_text(" ", strip=True)
    except Exception:
        # Fallback: naive tag removal
        title_match = re.search(r"<title>(.*?)</title>", html, flags=re.I | re.S)
        if title_match:
            title = re.sub(r"\s+", " ", title_match.group(1)).strip()
        # Remove scripts/styles
        html = re.sub(r"<script[\s\S]*?</script>", " ", html, flags=re.I)
        html = re.sub(r"<style[\s\S]*?</style>", " ", html, flags=re.I)
        # Strip tags
        text_only = re.sub(r"<[^>]+>", " ", html)
        text_content = re.sub(r"\s+", " ", text_only).strip()

    return title, text_content or ""



In [15]:
url = 'https://www.cnbc.com/2025/10/31/cre-companies-ai-goals.html'
fetch_page(url)

(None,
 'Few CRE companies have achieved their AI goals. Here&#x27;s why Skip Navigation Markets Pre-Markets U.S. Markets Europe Markets China Markets Asia Markets World Markets Currencies Cryptocurrency Futures &amp; Commodities Bonds Funds &amp; ETFs Business Economy Finance Health &amp; Science Media Real Estate Energy Climate Transportation Industrials Retail Wealth Sports Life Small Business Investing Personal Finance Fintech Financial Advisors Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity AI Enterprise Internet Media Mobile Social Media CNBC Disruptor 50 Tech Guide Politics White House Policy Defense Congress Expanding Opportunity Europe Politics China Politics Asia Politics World Politics Video Latest Video Full Episodes Livestream Top Video Live Audio Europe TV Asia TV CNBC Podcasts CEO Interviews Digital Originals Watchlist Investing Club Trust Portfolio Analysis Trade Alerts Meeting Videos Homestretch Jim&#x27;s Columns Education Subscribe 

In [None]:
title = "Few CRE companies have achieved their AI goals. Here's why"
text_content = '''Few CRE companies have achieved their AI goals. Here\'s why Skip Navigation Markets Pre-Markets U.S. Markets Europe Markets China Markets Asia Markets World Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Business Economy Finance Health & Science Media Real Estate Energy Climate Transportation Industrials Retail Wealth Sports Life Small Business Investing Personal Finance Fintech Financial Advisors Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity AI Enterprise Internet Media Mobile Social Media CNBC Disruptor 50 Tech Guide Politics White House Policy Defense Congress Expanding Opportunity Europe Politics China Politics Asia Politics World Politics Video Latest Video Full Episodes Livestream Top Video Live Audio Europe TV Asia TV CNBC Podcasts CEO Interviews Digital Originals Watchlist Investing Club Trust Portfolio Analysis Trade Alerts Meeting Videos Homestretch Jim\'s Columns Education Subscribe PRO Pro News Josh Brown Mike Santoli Calls of the Day My Portfolio Livestream Full Episodes Stock Screener Market Forecast Options Investing Chart Investing Subscribe Livestream Menu Make It select USA INTL Livestream Search quotes, news & videos Livestream Watchlist SIGN IN Create free account Markets Business Investing Tech Politics Video Watchlist Investing Club PRO Livestream Menu Real Estate Housing Construction REITs Rising Risks Newsletter Sign-up CNBC Property Play Just 5% of CRE companies have achieved their AI goals. Here\'s why Published Fri, Oct 31 2025 8:00 AM EDT Updated Fri, Oct 31 2025 8:11 AM EDT Diana Olick @in/dianaolick @DianaOlickCNBC @DianaOlick WATCH LIVE Key Points Real estate companies are moving beyond initial testing and exploration of AI into more targeted applications that aim to redefine value, according to a new survey from JLL. JLL found that 88% of investors, owners and landlords said they have started piloting AI, with most pursuing an average of five use cases simultaneously. Just 5% of respondents said they have achieved all their program goals, while close to half said they have achieved two to three goals. Diminishing perspective of downtown London skyscrapers Chunyip Wong | Istock | Getty Images A version of this article first appeared in the CNBC Property Play newsletter with Diana Olick. Property Play covers new and evolving opportunities for the real estate investor, from individuals to venture capitalists, private equity funds, family offices, institutional investors and large public companies. Sign up to receive future editions, straight to your inbox. The commercial real estate market has been historically slow to modernize, and yet it appears to be accelerating its adoption of artificial intelligence. Companies are moving beyond initial testing and exploration into more targeted applications that aim to redefine value, according to a new survey from JLL. The survey of more than 1,500 senior CRE investor and occupier decision-makers across various industries found that, while still in the early stages, organizations are making AI a priority in their technology budgets. They are also moving from using it just for efficiency to focusing on how it can grow their businesses. JLL found that 88% of investors, owners and landlords said they have started piloting AI, with most pursuing an average of five use cases simultaneously. And more than 90% of occupiers are running corporate real estate AI pilots, according to the report. Compare that with just 5% starting AI pilots two years ago. The adoption is fast, but not entirely easy. Just 5% of respondents said they have achieved all their program goals, while close to half said they have achieved two to three goals. Much of the efforts are still experimental, without much growth. "If you think about commercial real estate, traditionally, it is not a quick technology adopter, and it\'s usually skeptical," said Yao Morin, chief technology officer at JLL. "So the high number of adoptions is actually quite surprising to me. What is not surprising on the flip side is that only 5% actually thinks that they have achieved all the goals. This is pretty aligned with a lot of other industries as well." Get Property Play directly to your inbox CNBC\'s Property Play with Diana Olick covers new and evolving opportunities for the real estate investor, delivered weekly to your inbox. Subscribe here to get access today . The reason they\'re not hitting their goals is because the goal line has moved. Companies have gone beyond just wanting to do certain tasks faster, or so-called operational efficiencies. Now they are tying AI to their revenue goals. For example, some are using it to help them improve their investment risk models, making investment and portfolio decisions based on the output of AI. That will require big changes to the fundamental way they operate. "When you really start moving towards the revenue side, the margin expansion side, then it\'s going to require a lot more than just using a technology," Morin explained. "You can\'t just say, \'Well, I\'m saving you 10% to do this particular thing.\' Companies need to actually rethink their operating model, to rethink how they organize to actually achieve the savings." And so companies are investing heavily in AI, despite economic headwinds. More than half of investors surveyed by JLL have been able to get significant budget growth over the past two years in the space. Their No. 1 spend is on strategic advisory on technology or AI, and most report their budgets have increased solely due to AI. After that, the spending goes to upgrading both cyber- and data-security measures and infrastructure for AI integration. Morin said what she found really surprising is that while most think companies will start using AI for simple tasks, or, low-risk, low-hanging fruit, that was not at all the case. "Our survey showed the opposite. We are getting to a point of sophistication, beyond this initial skeptical phase, where companies are really focusing on the competitive advantage to pressing business problems, using AI to solve instead of [just] those simple low-risk operations." More In CNBC Property Play Why global investment firm Nuveen is betting on this niche real estate subsector Diana Olick Commercial real estate is finally embracing blockchain. Here\'s what investors should know Diana Olick Major real estate developers are fast becoming power brokers Diana Olick Read More Subscribe to CNBC PRO Subscribe to Investing Club Licensing & Reprints CNBC Councils Supply Chain Values CNBC on Peacock Join the CNBC Panel Digital Products News Releases Closed Captioning Corrections About CNBC Internships Site Map Ad Choices Careers Help Contact News Tips Got a confidential news tip? We want to hear from you. Get In Touch CNBC Newsletters Sign up for free newsletters and get more CNBC delivered to your inbox Sign Up Now Get this delivered to your inbox, and more info about our products and services. Advertise With Us Please Contact Us Privacy Policy Your Privacy Choices CA Notice Terms of Service © 2025 CNBC LLC. All Rights Reserved. A Division of NBCUniversal Data is a real-time snapshot *Data is delayed at least 15 minutes.\n      Global Business and Financial News, Stock Quotes, and Market Data\n      and Analysis. Market Data Terms of Use and Disclaimers Data also provided by'''

In [21]:
import re, json, requests
from typing import Optional, Tuple, List
from bs4 import BeautifulSoup

DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    )
}

def _first(s: List[Optional[str]]) -> Optional[str]:
    for v in s:
        if v and v.strip():
            return v.strip()
    return None

def _get_meta(soup: BeautifulSoup, names_props: List[Tuple[str, str]]) -> Optional[str]:
    # names_props: list of (attr_name, attr_value) pairs to try, e.g. ("property","og:title")
    for attr, val in names_props:
        tag = soup.find("meta", attrs={attr: val})
        if tag and tag.get("content"):
            return tag["content"].strip()
    return None

def _parse_jsonld_all(soup: BeautifulSoup) -> List[dict]:
    items = []
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string or tag.text or "")
            if isinstance(data, list):
                items.extend([x for x in data if isinstance(x, dict)])
            elif isinstance(data, dict):
                items.append(data)
        except Exception:
            continue
    return items

def _jsonld_pick_article(jsonlds: List[dict]) -> Optional[dict]:
    # Prefer NewsArticle/Article nodes
    for node in jsonlds:
        t = node.get("@type")
        if t == "NewsArticle" or t == "Article" or (isinstance(t, list) and any(x in ("NewsArticle","Article") for x in t)):
            return node
    # Sometimes the article is nested
    for node in jsonlds:
        for k, v in node.items():
            if isinstance(v, dict) and v.get("@type") in ("NewsArticle","Article"):
                return v
            if isinstance(v, list):
                for it in v:
                    if isinstance(it, dict) and it.get("@type") in ("NewsArticle","Article"):
                        return it
    return None

def extract_cnbc(url: str, timeout: float = 20.0) -> Tuple[Optional[str], str]:
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    html = r.text
    soup = BeautifulSoup(html, "html.parser")

    # ---------- TITLE ----------
    # 1) OpenGraph / Twitter
    title = _get_meta(soup, [
        ("property", "og:title"),
        ("name", "twitter:title"),
        ("name", "parsely-title"),
    ])
    # 2) JSON-LD headline
    if not title:
        jsonlds = _parse_jsonld_all(soup)
        node = _jsonld_pick_article(jsonlds)
        if node:
            title = _first([node.get("headline"), node.get("name")])
    # 3) H1 fallbacks
    if not title:
        h1 = soup.find("h1")
        if h1:
            title = h1.get_text(" ", strip=True)
    # 4) <title> tag last
    if not title and soup.title:
        title = soup.title.get_text(" ", strip=True)

    # ---------- BODY ----------
    # A) JSON-LD articleBody (best: avoids nav/footers)
    body_text = ""
    if 'jsonlds' not in locals():
        jsonlds = _parse_jsonld_all(soup)
    node = _jsonld_pick_article(jsonlds)
    if node:
        # CNBC often provides articleBody as a string or list of paragraphs
        ab = node.get("articleBody")
        if isinstance(ab, str) and ab.strip():
            body_text = ab.strip()
        elif isinstance(ab, list):
            body_text = "\n\n".join([x for x in ab if isinstance(x, str) and x.strip()])

    # B) DOM fallbacks (scoped to main article region only)
    if not body_text:
        # Try common CNBC containers (several variants over time)
        containers = []
        # data-testid variants
        containers += soup.select('main [data-testid="ArticleBody"], article [data-testid="ArticleBody"]')
        # class name patterns historically used
        containers += soup.select('div[class*="ArticleBody-"], article div[class*="ArticleBody-"]')
        # generic article paragraphs
        if not containers:
            containers += soup.select("main article")
        # collect paragraphs
        paras = []
        for c in containers[:2]:  # don’t over-collect in case of duplicates
            for p in c.find_all(["p","h2","li"]):
                # skip bylines/captions if marked
                txt = p.get_text(" ", strip=True)
                if txt:
                    paras.append(txt)
            if paras:
                break
        body_text = "\n\n".join(paras)

    # C) Very last resort: strip everything (you already tried this—kept here as fallback)
    if not body_text:
        # Remove scripts/styles/noscript
        for t in soup(["script","style","noscript"]):
            t.decompose()
        # Try to confine to <main>
        main = soup.find("main")
        if main:
            body_text = main.get_text(" ", strip=True)
        else:
            body_text = soup.get_text(" ", strip=True)

    return title, body_text

# Example:
# t, content = extract_cnbc("https://www.cnbc.com/2025/10/30/government-shutdown-delta-air-traffic-controllers.html")
# print(t)
# print(content[:1000])


# Test flow

In [None]:
import os
import sys
import re
import logging
from typing import Tuple, Optional
import requests

import google.generativeai as genai
from pydantic_ai import Agent
from pydantic_ai.models.google import GoogleModel
from pydantic_ai.providers.google import GoogleProvider

# Create and configure logger
logging.basicConfig(filename="./logs/newfile.log",
                    format='%(asctime)s %(message)s',
                    filemode='w')
logger = logging.getLogger()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from dotenv import load_dotenv
load_dotenv()

GEMINI_KEY = os.getenv('GOOGLE_API_KEY')
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    )
}

# Import pydant
provider = GoogleProvider(api_key=GEMINI_KEY)
model = GoogleModel('gemini-2.5-flash', provider=provider)

In [None]:
import re
import os
import logging
import asyncio
import httpx
from typing import Tuple, Optional, List, Dict

from pydantic_ai import Agent
from pydantic_ai.models.google import GoogleModel, GoogleProvider

from models import ClassificationResultFromText
from dotenv import load_dotenv
load_dotenv()

# Configure logger
logging.basicConfig(filename="./logs/newfile.log",
                    format='%(asctime)s %(message)s',
                    filemode='w')
logger = logging.getLogger(__name__)

GEMINI_KEY = os.getenv('GOOGLE_API_KEY')
# Default headers for HTTP requests
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

TIMEOUT = 30
LLM_MODEL = 'gemini-2.5-flash-lite'

class NewsAnalyzer:
    def __init__(self, gemini_key: str):
        self.provider = GoogleProvider(api_key=gemini_key)
        self.model = GoogleModel(LLM_MODEL, provider=self.provider)

        # Create agent with system prompt and output type
        self.agent = Agent(
            self.model,
            output_type=ClassificationResultFromText,
            system_prompt="""You are a professional news analyst specializing in financial and business reporting.
                            Your goal is to interpret a given news article and provide a concise, structured analysis.:
                            Given a news title and content, perform the following tasks clearly and objectively:

                            1. Financial Relevance:
                            - Determine whether the news is related to finance, economy, or markets.
                            - Output: Yes or No.

                            2. Sector Classification:
                            - Identify which industry or sector the news focuses on.
                            - (Examples: Technology, Banking, F&B, Heavy Industry, Manufacturing, Energy, Healthcare, etc.)

                            3. Companies Mentioned:
                            - List all companies, organizations, or indices mentioned in the article.
                            - If none are directly mentioned, state: None.

                            4. Sentiment Analysis:
                            - Classify the overall tone of the news as Positive, Negative, or Neutral.
                            - Provide a confidence score between 1.0 and 10.0, where 10.0 = highest confidence.

                            5. Summaries:
                            - English Summary: 2–3 sentences summarizing the main points.
                            - Turkish Summary: 2–3 sentences summarizing the same in Turkish.

                            Example:
                            China's factory activity growth in October missed market expectations,
                            dragged down by a sharper drop in new export orders,
                            as trade tensions with the U.S. intensified during the month, according to a private survey released Monday.
                            The RatingDog China General Manufacturing PMI, compiled by S&P Global,
                            dropped to 50.6 in October from the six-month high of 51.2 in September,
                            missing analysts' expectations of 50.9 in a Reuters poll.
                            New export orders fell at the quickest pace since May,
                            which the survey respondents attributed to "rising trade uncertainty.

                            Output:
                            - Financial Check: Yes
                            - Sector: Manufacturing, Industrial Production
                            - Companies Mentioned: S&P Global, RatingDog China General Manufacturing PMI
                            - Sentiment Classification: Negative
                            Reason: Factory growth slowed, missed expectations, and export orders fell sharply amid U.S. trade tensions.
                            - Confidence Score: 9.0 / 10.0
                            - English Summary:
                            China's manufacturing sector lost momentum in October as the RatingDog China General Manufacturing PMI slipped to 50.6, below expectations. New export orders dropped at the fastest rate since May due to growing trade tensions with the U.S., signaling mounting pressure on the industrial economy.
                            - Turkish Summary:
                            Çin'in imalat sektörü Ekim ayında ivme kaybetti. RatingDog Çin Genel İmalat PMI endeksi 50,6'ya gerileyerek beklentilerin altında kaldı. Yeni ihracat siparişleri, ABD ile artan ticaret gerilimi nedeniyle Mayıs ayından bu yana en hızlı düşüşünü yaşadı.
                            Bu durum, sanayi ekonomisi üzerindeki baskının arttığını gösteriyor."""
        )

        # Test the agent with a simple query
        logger.info("✓ Pydantic AI Agent configured successfully")

    async def llm_analyzer(
        self, contents: str, title: str, timeout: float = TIMEOUT
    ) -> ClassificationResultFromText:
        """Analyze news content with LLM. Includes timeout protection for long inference."""
        user_message = f"- Title: {title}\n- Contents: {contents}"

        try:
            logger.info(f"Analyzing {len(contents)} chars of text via LLM")

            response = await asyncio.wait_for(self.agent.run(user_message), timeout=timeout)

            return ClassificationResultFromText(
                page_title=title,
                is_financial=response.output.is_financial,
                country=getattr(response.output, "country", None),
                sector=response.output.sector,
                companies=response.output.companies,
                confident_score=response.output.confident_score,
                sentiment=response.output.sentiment,
                summary_en=response.output.summary_en,
                summary_tr=response.output.summary_tr,
                extracted_characters=len(contents or ""),
            )

        except asyncio.TimeoutError:
            logger.error("LLM analysis timed out.")
            raise TimeoutError(
                f"LLM analysis exceeded timeout of {timeout} seconds.")
        except Exception as e:
            logger.exception(f"Error during LLM analysis: {e}")
            raise

    async def analyze_with_url(self, url: str, timeout=TIMEOUT) -> ClassificationResultFromText:
        """Complete analysis pipeline: extract URL content and analyze with LLM."""
        title, text = await self.extract_url(url)
        llm_output = await self.llm_analyzer(contents=text, title=title, timeout=TIMEOUT)

        return llm_output

    async def analyze_with_contents(self, text: str, title: str, timeout=TIMEOUT) -> ClassificationResultFromText:
        """Complete analysis pipeline: analyze with text and title"""
        llm_output = await self.llm_analyzer(contents=text, title=title)

        return llm_output

def get_analyzer() -> NewsAnalyzer:
    """Get or create the NewsAnalyzer singleton instance."""
    api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError(
            "Missing API key. Set GOOGLE_API_KEY or GEMINI_API_KEY environment variable."
        )
    analyzer = NewsAnalyzer(gemini_key=api_key)
    return analyzer


In [9]:
analyzer = get_analyzer()



In [None]:
title = "Few CRE companies have achieved their AI goals. Here's why"
text_content = '''Few CRE companies have achieved their AI goals. Here\'s why Skip Navigation Markets Pre-Markets U.S. Markets Europe Markets China Markets Asia Markets World Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Business Economy Finance Health & Science Media Real Estate Energy Climate Transportation Industrials Retail Wealth Sports Life Small Business Investing Personal Finance Fintech Financial Advisors Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity AI Enterprise Internet Media Mobile Social Media CNBC Disruptor 50 Tech Guide Politics White House Policy Defense Congress Expanding Opportunity Europe Politics China Politics Asia Politics World Politics Video Latest Video Full Episodes Livestream Top Video Live Audio Europe TV Asia TV CNBC Podcasts CEO Interviews Digital Originals Watchlist Investing Club Trust Portfolio Analysis Trade Alerts Meeting Videos Homestretch Jim\'s Columns Education Subscribe PRO Pro News Josh Brown Mike Santoli Calls of the Day My Portfolio Livestream Full Episodes Stock Screener Market Forecast Options Investing Chart Investing Subscribe Livestream Menu Make It select USA INTL Livestream Search quotes, news & videos Livestream Watchlist SIGN IN Create free account Markets Business Investing Tech Politics Video Watchlist Investing Club PRO Livestream Menu Real Estate Housing Construction REITs Rising Risks Newsletter Sign-up CNBC Property Play Just 5% of CRE companies have achieved their AI goals. Here\'s why Published Fri, Oct 31 2025 8:00 AM EDT Updated Fri, Oct 31 2025 8:11 AM EDT Diana Olick @in/dianaolick @DianaOlickCNBC @DianaOlick WATCH LIVE Key Points Real estate companies are moving beyond initial testing and exploration of AI into more targeted applications that aim to redefine value, according to a new survey from JLL. JLL found that 88% of investors, owners and landlords said they have started piloting AI, with most pursuing an average of five use cases simultaneously. Just 5% of respondents said they have achieved all their program goals, while close to half said they have achieved two to three goals. Diminishing perspective of downtown London skyscrapers Chunyip Wong | Istock | Getty Images A version of this article first appeared in the CNBC Property Play newsletter with Diana Olick. Property Play covers new and evolving opportunities for the real estate investor, from individuals to venture capitalists, private equity funds, family offices, institutional investors and large public companies. Sign up to receive future editions, straight to your inbox. The commercial real estate market has been historically slow to modernize, and yet it appears to be accelerating its adoption of artificial intelligence. Companies are moving beyond initial testing and exploration into more targeted applications that aim to redefine value, according to a new survey from JLL. The survey of more than 1,500 senior CRE investor and occupier decision-makers across various industries found that, while still in the early stages, organizations are making AI a priority in their technology budgets. They are also moving from using it just for efficiency to focusing on how it can grow their businesses. JLL found that 88% of investors, owners and landlords said they have started piloting AI, with most pursuing an average of five use cases simultaneously. And more than 90% of occupiers are running corporate real estate AI pilots, according to the report. Compare that with just 5% starting AI pilots two years ago. The adoption is fast, but not entirely easy. Just 5% of respondents said they have achieved all their program goals, while close to half said they have achieved two to three goals. Much of the efforts are still experimental, without much growth. "If you think about commercial real estate, traditionally, it is not a quick technology adopter, and it\'s usually skeptical," said Yao Morin, chief technology officer at JLL. "So the high number of adoptions is actually quite surprising to me. What is not surprising on the flip side is that only 5% actually thinks that they have achieved all the goals. This is pretty aligned with a lot of other industries as well." Get Property Play directly to your inbox CNBC\'s Property Play with Diana Olick covers new and evolving opportunities for the real estate investor, delivered weekly to your inbox. Subscribe here to get access today . The reason they\'re not hitting their goals is because the goal line has moved. Companies have gone beyond just wanting to do certain tasks faster, or so-called operational efficiencies. Now they are tying AI to their revenue goals. For example, some are using it to help them improve their investment risk models, making investment and portfolio decisions based on the output of AI. That will require big changes to the fundamental way they operate. "When you really start moving towards the revenue side, the margin expansion side, then it\'s going to require a lot more than just using a technology," Morin explained. "You can\'t just say, \'Well, I\'m saving you 10% to do this particular thing.\' Companies need to actually rethink their operating model, to rethink how they organize to actually achieve the savings." And so companies are investing heavily in AI, despite economic headwinds. More than half of investors surveyed by JLL have been able to get significant budget growth over the past two years in the space. Their No. 1 spend is on strategic advisory on technology or AI, and most report their budgets have increased solely due to AI. After that, the spending goes to upgrading both cyber- and data-security measures and infrastructure for AI integration. Morin said what she found really surprising is that while most think companies will start using AI for simple tasks, or, low-risk, low-hanging fruit, that was not at all the case. "Our survey showed the opposite. We are getting to a point of sophistication, beyond this initial skeptical phase, where companies are really focusing on the competitive advantage to pressing business problems, using AI to solve instead of [just] those simple low-risk operations." More In CNBC Property Play Why global investment firm Nuveen is betting on this niche real estate subsector Diana Olick Commercial real estate is finally embracing blockchain. Here\'s what investors should know Diana Olick Major real estate developers are fast becoming power brokers Diana Olick Read More Subscribe to CNBC PRO Subscribe to Investing Club Licensing & Reprints CNBC Councils Supply Chain Values CNBC on Peacock Join the CNBC Panel Digital Products News Releases Closed Captioning Corrections About CNBC Internships Site Map Ad Choices Careers Help Contact News Tips Got a confidential news tip? We want to hear from you. Get In Touch CNBC Newsletters Sign up for free newsletters and get more CNBC delivered to your inbox Sign Up Now Get this delivered to your inbox, and more info about our products and services. Advertise With Us Please Contact Us Privacy Policy Your Privacy Choices CA Notice Terms of Service © 2025 CNBC LLC. All Rights Reserved. A Division of NBCUniversal Data is a real-time snapshot *Data is delayed at least 15 minutes.\n      Global Business and Financial News, Stock Quotes, and Market Data\n      and Analysis. Market Data Terms of Use and Disclaimers Data also provided by'''
test = await analyzer.analyze_with_contents(text=text_content, title=title)

In [10]:
output = test.model_dump()
output

{'page_title': "Few CRE companies have achieved their AI goals. Here's why",
 'is_financial': 'Yes',
 'country': [],
 'sector': [],
 'companies': [],
 'confident_score': 9.5,
 'sentiment': 'Neutral',
 'summary_en': 'A recent JLL survey indicates that while commercial real estate (CRE) companies are increasingly adopting AI, with 88% piloting it for an average of five use cases, only 5% have fully achieved their AI goals. This is attributed to the moving goalposts, as companies now aim to tie AI to revenue and business growth rather than just operational efficiencies, requiring fundamental changes to operating models.',
 'summary_tr': 'Yeni bir JLL anketine göre, ticari gayrimenkul (TG) şirketleri giderek daha fazla YZ benimsemesine rağmen (ortalamada beş kullanım durumu için pilot uygulama yapanların %88',
 'extracted_characters': 7365}

In [11]:
url = 'https://www.cnbc.com/2025/10/30/government-shutdown-delta-air-traffic-controllers.html'
title, contents = await analyzer.extract_url(url=url)

In [16]:
contents

'Delta, United and American call on Congress to end government shutdown Skip Navigation Markets Pre-Markets U.S. Markets Europe Markets China Markets Asia Markets World Markets Currencies Cryptocurrency Futures &amp; Commodities Bonds Funds &amp; ETFs Business Economy Finance Health &amp; Science Media Real Estate Energy Climate Transportation Industrials Retail Wealth Sports Life Small Business Investing Personal Finance Fintech Financial Advisors Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity AI Enterprise Internet Media Mobile Social Media CNBC Disruptor 50 Tech Guide Politics White House Policy Defense Congress Expanding Opportunity Europe Politics China Politics Asia Politics World Politics Video Latest Video Full Episodes Livestream Top Video Live Audio Europe TV Asia TV CNBC Podcasts CEO Interviews Digital Originals Watchlist Investing Club Trust Portfolio Analysis Trade Alerts Meeting Videos Homestretch Jim&#x27;s Columns Education Subscribe P

In [None]:
from dataclasses import dataclass
from typing import List, Literal, Optional

Plan = Literal["batch", "standard"]

# === PRICES: text-only (from your screenshots) ===
# Standard:  Input $0.30/M, Output $2.50/M, Cache write $0.03/M, Storage $1.00/M/hour
# Batch:     Input $0.15/M, Output $1.25/M, Cache write $0.03/M, Storage $1.00/M/hour

@dataclass(frozen=True)
class TextPrices:
    input_per_million: float
    output_per_million: float
    cache_write_per_million: float
    cache_storage_per_million_per_hour: float

PRICES_TEXT = {
    "standard": TextPrices(
        input_per_million=0.30,
        output_per_million=2.50,
        cache_write_per_million=0.03,
        cache_storage_per_million_per_hour=1.00,
    ),
    "batch": TextPrices(
        input_per_million=0.15,
        output_per_million=1.25,
        cache_write_per_million=0.03,
        cache_storage_per_million_per_hour=1.00,
    ),
}

GROUNDING_SEARCH_USD_PER_1000 = 35.00  # both plans; batch maps not available
TOKEN_SCALE = 1_000_000.0


def tokens_to_usd(tokens: int, usd_per_million: float) -> float:
    return (tokens / TOKEN_SCALE) * usd_per_million


def calc_text_cost_usd(
    *,
    plan: Plan,                 # "batch" (your use-case) or "standard"
    prompt_tokens: int,         # total input tokens for ONE request (system + title + content + etc.)
    output_tokens: int,         # output tokens for ONE request
    cache_write_tokens: int = 0,
    cache_storage_tokens: int = 0,
    cache_storage_hours: float = 0.0,
    grounded_search_prompts_paid: int = 0,  # after subtracting free quota
) -> dict:
    """
    Cost for a single **text-only** request.
    """
    p = PRICES_TEXT[plan]

    input_cost = tokens_to_usd(prompt_tokens, p.input_per_million)
    output_cost = tokens_to_usd(output_tokens, p.output_per_million)
    cache_write_cost = tokens_to_usd(cache_write_tokens, p.cache_write_per_million) if cache_write_tokens else 0.0

    cache_storage_cost = 0.0
    if cache_storage_tokens and cache_storage_hours:
        per_hour = tokens_to_usd(cache_storage_tokens, p.cache_storage_per_million_per_hour)
        cache_storage_cost = per_hour * cache_storage_hours

    grounding_search_cost = (grounded_search_prompts_paid / 1000.0) * GROUNDING_SEARCH_USD_PER_1000

    total = round(input_cost + output_cost + cache_write_cost + cache_storage_cost + grounding_search_cost, 6)

    return {
        "plan": plan,
        "input_tokens": prompt_tokens,
        "output_tokens": output_tokens,
        "costs": {
            "input_usd": round(input_cost, 6),
            "output_usd": round(output_cost, 6),
            "cache_write_usd": round(cache_write_cost, 6),
            "cache_storage_usd": round(cache_storage_cost, 6),
            "grounding_search_usd": round(grounding_search_cost, 6),
        },
        "total_usd": total,
        "rates_per_1M": {
            "input": p.input_per_million,
            "output": p.output_per_million,
            "cache_write": p.cache_write_per_million,
            "cache_storage_per_hour": p.cache_storage_per_million_per_hour,
            "grounding_search_per_1000": GROUNDING_SEARCH_USD_PER_1000,
        },
    }


def count_total_characters(data) -> int:
    """
    Recursively count total characters across all string values in a nested dict/list structure.
    Non-string fields (numbers, booleans, None) are ignored.

    Example:
        count_total_characters({"a": "Hello", "b": ["World", 123]})  # -> 10
    """
    total_chars = 0

    if isinstance(data, str):
        total_chars += len(data)

    elif isinstance(data, dict):
        for value in data.values():
            total_chars += count_total_characters(value)

    elif isinstance(data, list):
        for item in data:
            total_chars += count_total_characters(item)

    return total_chars


count_total_characters(output)

648

In [None]:
test_link = ['https://www.cnbc.com/2025/11/03/china-factory-activity-october-pmi-ratingdog-private-survey-shows.html',
    "https://www.reuters.com/world/asia-pacific/dollar-flirts-with-three-month-peak-investors-look-us-data-releases-2025-11-03/?utm_source=chatgpt.com",
    "https://www.reuters.com/world/asia-pacific/chinas-finance-ministry-sets-up-new-debt-management-department-2025-11-03/?utm_source=chatgpt.com",
    "https://www.fxstreet.com/news/japanese-yen-languishes-near-multi-month-low-against-bullish-usd-amid-boj-uncertainty-202511030249?utm_source=chatgpt.com",
    "https://www.fxstreet.com/news/eur-usd-holds-losses-below-11550-as-fed-rate-cut-bets-decrease-202511030104?utm_source=chatgpt.com",
    "https://finance.yahoo.com/news/3-brilliant-growth-stocks-buy-002300772.html?utm_source=chatgpt.com",
    "https://www.ft.com/content/a281b378-687e-498e-acfd-ebeebd1e7cf8?utm_source=chatgpt.com",
    "https://www.reuters.com/world/middle-east/major-gulf-bourses-fall-weak-earnings-us-rate-cut-uncertainty-2025-11-03/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/us-bancorp-quarterly-profit-jumps-higher-fee-income-2025-10-16/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/finance/bank-america-picks-manelski-zuberi-run-global-markets-unit-memo-shows-2025-10-31/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/finance/us-banks-reap-bigger-profits-deals-rebound-third-quarter-2025-10-09/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/finance/us-banking-giants-expect-dealmaking-spree-continue-profits-climb-2025-10-14/?utm_source=chatgpt.com",
    "https://www.barrons.com/articles/stock-futures-trading-november-f84e36e0?utm_source=chatgpt.com",
    "https://www.tradingview.com/news/tradingview%3A426d631d60a9c%3A0-key-facts-analysts-predict-dogecoin-breakout-targets-up-to-48-rsi-stable/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/finance/australian-lender-westpacs-annual-profit-falls-2-2025-11-02/?utm_source=chatgpt.com",
    "https://www.reuters.com/business/finance/us-bank-profits-climb-regulator-adjusts-problem-bank-tracking-2025-02-25/?utm_source=chatgpt.com",
    "https://www.cnbc.com/2025/11/03/why-the-us-dollar-has-been-strong-lately.html",
    "https://www.cnbc.com/2025/11/02/bank-of-america-q3-earnings-what-to-watch.html",
    "https://www.cnbc.com/2025/11/01/fed-rate-cut-bets-and-what-it-means-for-markets.html",
    "https://www.cnbc.com/2025/10/31/china-local-government-debt-what-investors-need-to-know.html",
    "https://www.cnbc.com/2025/10/30/how-big-tech-earnings-are-influencing-broader-market-sentiment.html"
]


# Test with 20 contents

In [None]:
values = []
for i in range(0, 20):
    tmp = {
        'id': f'id_{i}',
        'title': f'test_{i}',
        'contents': contents
    }
    values.append(tmp)


In [1]:
import os
from batch_processor import BatchProcessor
import json
from dotenv import load_dotenv
load_dotenv()

GEMINI_KEY = os.getenv('GOOGLE_API_KEY')

from batch_processor import BatchProcessor
import json

# Load content
with open('batch_contents.json') as f:
    contents = json.load(f)

# Initialize processor
processor = BatchProcessor(gemini_key=GEMINI_KEY)

# Prepare and submit batch
batch_file = processor.prepare_batch_from_contents(contents, "my_batch")
job_id = processor.submit_batch(batch_file)  # Should work now!

# Wait for completion
processor.wait_for_completion(job_id)

# Get results
results = processor.retrieve_results(job_id, "my_batch")



ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource has been exhausted (e.g. check quota).', 'status': 'RESOURCE_EXHAUSTED'}}