In [2]:
# ---### 1. INSTALL AND IMPORT LIBRARIES ###---
!pip install -q google-generativeai
!pip install feedparser

import google.generativeai as genai
import feedparser
from google.colab import userdata
from datetime import datetime, timedelta
import re
import warnings
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, logging as hf_logging
import torch
from IPython.display import HTML
import urllib.parse
import hashlib

# ---### 2. GLOBAL SETTINGS & API CONFIGURATION ###---
# Silence Python warnings and Hugging Face log output for the rest of the run.
warnings.filterwarnings('ignore')
hf_logging.set_verbosity_error()

# Read the Gemini key from Colab Secrets and configure the client.
# `gemini_is_configured` records whether both steps completed without raising;
# the Gemini helper functions consult it before making any API call.
try:
    GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GEMINI_API_KEY)
except Exception as e:
    print(f"🔴 FATAL: GOOGLE_API_KEY not found in Colab Secrets. Please check your setup. Error: {e}")
    gemini_is_configured = False
else:
    print("✅ Successfully configured Gemini API.")
    gemini_is_configured = True

# ---### 3. AI MODEL INITIALIZATION ###---
print("🚀 Initializing Dashboard...")
# Only build the pipeline when no global named `ner` exists yet, so re-running
# the cell does not load the model a second time.
if 'ner' not in globals():
    print("🔄 Loading local NER model...")
    ner = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
        # Device index 0 when CUDA is available, otherwise -1.
        device=0 if torch.cuda.is_available() else -1,
    )
    print("✅ NER model loaded!")


# ---### 4. CONSTANTS AND AI FUNCTIONS (MODIFIED FOR DNV) ###---
# One entry per market searched. 'lang' and 'gl' feed the hl/gl/ceid parameters
# of the Google News RSS URL; 'keywords' are OR-ed together into the query.
COUNTRY_TARGETS = [
    # MODIFIED: Removed 'Vattenfall', added 'DNV' as a more relevant keyword
    {
        'country': 'Germany',
        'lang': 'de',
        'gl': 'DE',
        'keywords': ['DNV', 'Erneuerbare Energien', 'Zertifizierung', 'Netzausbau'],
    },
    {
        'country': 'Sweden',
        'lang': 'sv',
        'gl': 'SE',
        'keywords': ['DNV', 'Förnybar energi', 'certifiering', 'energisystem'],
    },
    {
        'country': 'Netherlands',
        'lang': 'nl',
        'gl': 'NL',
        'keywords': ['DNV', 'Hernieuwbare energie', 'certificering', 'Stroomnet'],
    },
    {
        'country': 'United Kingdom',
        'lang': 'en',
        'gl': 'GB',
        'keywords': ['DNV', 'Renewable energy', 'certification', 'risk management'],
    },
    # Disabled entry, kept for reference (NOTE: its 'gl' is still 'GB'):
    # {'country': 'USA', 'lang': 'en', 'gl': 'GB', 'keywords': ['DNV', 'Renewable energy', 'certification', 'risk management']},
    # NEW: Added Norway, DNV's home country
    {
        'country': 'Norway',
        'lang': 'no',
        'gl': 'NO',
        'keywords': ['DNV', 'fornybar energi', 'sertifisering', 'energisystemer'],
    },
]

# Lower-case terms searched for (as whole words) when tagging an article's energy type.
ENERGY_TYPES = [
    'solar', 'wind', 'offshore wind', 'onshore wind', 'hydrogen',
    'hydroelectric', 'geothermal', 'biomass', 'battery storage', 'grid storage',
    'tidal', 'wave energy', 'nuclear', 'carbon capture', 'grid',
]

# MODIFIED: Added DNV, removed Vattenfall
# Company names matched case-insensitively when tagging an article's companies.
COMPANIES = [
    'DNV', 'Tesla', 'NextEra Energy', 'Vestas', 'Siemens Gamesa',
    'GE Renewable Energy', 'Orsted', 'Ørsted', 'Enel', 'Iberdrola',
    'EDF', 'RWE', 'E.ON', 'National Grid', 'Ofgem', 'TenneT',
]

def translate_text_with_gemini(text, source_lang):
    """Translate *text* to English using Gemini.

    Args:
        text: The text to translate.
        source_lang: Language code of *text* (e.g. 'de'); 'en' means no
            translation is needed.

    Returns:
        *text* unchanged when it is English or blank; a bracketed placeholder
        string when the API key is missing or the API call raises; otherwise
        the text of the model's response.
    """
    # Nothing to do for English or whitespace-only input.
    if source_lang == 'en' or not text.strip():
        return text
    if not gemini_is_configured:
        return "[Translation skipped due to missing API key]"
    print(f"      -> Translating from '{source_lang}'...")

    translator = genai.GenerativeModel('gemini-1.5-flash')
    request = f"Translate the following text from language code '{source_lang}' to professional, clear English:\n\n{text}"
    try:
        reply = translator.generate_content(request, request_options={'timeout': 100})
        return reply.text
    except Exception as e:
        return f"[Translation failed: {e}]"
    finally:
        # Pause for 3 seconds after every API attempt, successful or not.
        time.sleep(3)

def summarize_with_gemini(text_to_summarize, max_words=100):
    """Summarize article text with Gemini from a market-intelligence angle.

    Args:
        text_to_summarize: English article text (or several summaries joined
            together) to condense.
        max_words: Approximate length requested from the model.

    Returns:
        The model's summary text, or a string containing
        "could not be generated" when the input is unusable, the API key is
        missing, or the API call raises.
    """
    # Input checks run first so unusable text is rejected without touching
    # any API state.
    if not text_to_summarize or not text_to_summarize.strip():
        return "Summary could not be generated."
    # translate_text_with_gemini reports problems as "[Translation failed: <error>]"
    # or "[Translation skipped ...]". Matching on the opening of the marker (rather
    # than the exact text "[Translation failed]") ensures those placeholders are
    # recognised and never sent to the model as if they were article content.
    translation_error_markers = ("[Translation failed", "[Translation skipped")
    if any(marker in text_to_summarize for marker in translation_error_markers):
        return "Summary could not be generated."
    if not gemini_is_configured:
        return "Summary could not be generated."

    model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
    # This prompt is generic and still works well for DNV
    prompt = f"""As a market intelligence analyst for an energy and assurance company, your task is to summarize the provided news article text in about {max_words} words.
Your first priority is to focus strictly on market insights, new projects, regulatory changes, and key financial outcomes.
If that specific information is absent, then pivot and provide a concise summary of the article's key facts and main outcomes from a market intelligence or strategic perspective.
"""
    final_prompt = f"{prompt}\n\nNews Article Text: \"{text_to_summarize}\""

    try:
        response = model.generate_content(final_prompt, request_options={'timeout': 100})
        return response.text
    except Exception as e:
        return f"Summary could not be generated due to an API error: {e}"
    finally:
        # Pause for 3 seconds after every API attempt, successful or not.
        time.sleep(3)

def get_strategic_insight(text_summary, company="DNV"):
    """Ask Gemini for strategic threats and opportunities based on a news summary.

    Args:
        text_summary: The executive summary to analyse.
        company: Name of the company the analysis is written for. It is used
            in both the progress message and the prompt. The prompt always
            describes the company as "an assurance and risk management
            company", so that wording only fits companies of that kind.

    Returns:
        The model's response text, or a string starting with
        "Strategic insight could not be generated" when the summary is a
        failure placeholder, the API key is missing, or the API call raises.
    """
    unavailable = "Strategic insight could not be generated because the executive summary was unavailable."
    # Failure placeholders produced by the summarisation steps are not worth analysing.
    if "could not be generated" in text_summary or "Not enough successful summaries" in text_summary:
        return unavailable
    if not gemini_is_configured:
        return unavailable

    print(f"🤖 Generating Strategic Insight for {company}...")
    model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
    # MODIFIED: Prompt is now from DNV's perspective (the default value of `company`)
    prompt = (
        f"As a strategy advisor for {company}, an assurance and risk management company, "
        "what are the key strategic threats and opportunities (e.g., for new services, "
        "certifications, or advisory roles) based only on this news summary? "
        "Be concise and use bullet points."
        f"\n\nNews Summary: \"{text_summary}\"\n\nStrategic Analysis:"
    )
    try:
        # Same request timeout as the other Gemini calls in this file, so a
        # stalled request cannot block the run indefinitely.
        response = model.generate_content(prompt, request_options={'timeout': 100})
        return response.text
    except Exception as e:
        return f"Strategic insight could not be generated due to an API error: {e}"
    finally:
        # Pause for 3 seconds after every API attempt, successful or not.
        time.sleep(3)

# ---### 5. CORE PROCESSING & HELPER FUNCTIONS ###---
def process_article(article):
    """Turn one fetched article into a dashboard row.

    Scrapes the article page (falling back to the feed's raw summary),
    translates the title and body, summarizes the translated body, and tags
    the result with companies, energy types, amounts and a news category.

    Args:
        article: Dict with the keys 'title', 'link', 'source', 'country',
            'language', 'published_date' and 'raw_summary'.

    Returns:
        Dict with the keys 'published_date', 'title', 'source', 'link',
        'summary', 'companies', 'energy_types', 'amounts' and 'news_nature'.
    """
    def _row(summary_text, company_text, energy_text, amount_text, nature_text):
        # Shared shape of the result so both exits emit identical keys in identical order.
        return {
            'published_date': article['published_date'],
            'title': article['title'],
            'source': f"{article['source']} ({article['country']})",
            'link': article['link'],
            'summary': summary_text,
            'companies': company_text,
            'energy_types': energy_text,
            'amounts': amount_text,
            'news_nature': nature_text,
        }

    print(f"   -> Processing '{article['title'][:50]}...' from {article['country']}")
    scraped_text = get_full_article_text(article['link'])
    # Prefer the scraped page text; use the feed summary when scraping yielded nothing.
    text_for_analysis = scraped_text if scraped_text else article['raw_summary']

    # Fewer than 50 characters of content is treated as unusable.
    if not text_for_analysis or len(text_for_analysis.strip()) < 50:
        print("      -> No usable content found. Skipping AI.")
        return _row("Article content could not be retrieved.", 'N/A', 'N/A', 'N/A', 'Unknown')

    source_language = article['language']
    translated_title = translate_text_with_gemini(article['title'], source_language)
    translated_body = translate_text_with_gemini(text_for_analysis, source_language)
    summary = summarize_with_gemini(translated_body)

    # Entity extraction runs on the translated title plus the summary.
    text_for_ner = f"{translated_title}. {summary}"
    companies = extract_companies(text_for_ner)
    energy_types = extract_energy_types(text_for_ner)
    amounts = extract_amounts(text_for_ner)
    nature = classify_news_nature(translated_title, summary)

    return _row(
        summary,
        ', '.join(companies) or 'Not specified',
        ', '.join(energy_types) or 'Not specified',
        ', '.join(amounts) or 'Not specified',
        nature,
    )

def fetch_and_process_news():
    """Fetch Google News RSS results for every country target and process them.

    For each entry in COUNTRY_TARGETS a search feed is requested and at most
    the first four entries are kept. Every collected article is then passed
    through process_article.

    Returns:
        A pandas DataFrame with one row per processed article.
    """
    print("📰 Fetching targeted, multilingual news...")
    all_articles = []
    for target in COUNTRY_TARGETS:
        try:
            # Each keyword is quoted and the quoted terms are OR-ed together.
            quoted_terms = [f'"{kw}"' for kw in target['keywords']]
            encoded_query = urllib.parse.quote_plus(" OR ".join(quoted_terms))
            url = (
                f"https://news.google.com/rss/search?q={encoded_query}"
                f"&hl={target['lang']}&gl={target['gl']}"
                f"&ceid={target['gl']}:{target['lang']}"
            )
            print(f"   📡 Searching in {target['country']}...")
            feed = feedparser.parse(url)
            for entry in feed.entries[:4]:
                all_articles.append({
                    'source': entry.get('source', {}).get('title', 'Google News'),
                    'title': entry.get('title', ''),
                    'link': entry.get('link', ''),
                    'published_date': parse_date(entry),
                    'raw_summary': entry.get('summary', ''),
                    'language': target['lang'],
                    'country': target['country'],
                })
        except Exception as e:
            # A failure for one country is reported and the remaining countries still run.
            print(f"   ❌ Error with {target['country']}: {e}")

    print(f"✅ Fetched {len(all_articles)} total articles.")
    print(f"\n🧠 Processing {len(all_articles)} articles (this will take a very long time due to API pauses)...")
    processed_articles = [process_article(article) for article in all_articles]
    return pd.DataFrame(processed_articles)

def generate_executive_summary(df):
    """Condense all usable article summaries into one executive summary.

    Args:
        df: DataFrame with a 'summary' column of strings.

    Returns:
        A fixed message when no usable summaries remain; otherwise the result
        of summarize_with_gemini on the joined summaries (max_words=250).
    """
    print("📄 Generating Executive Summary...")
    # Placeholder texts produced by failed scraping or summarisation are excluded.
    failure_markers = ("could not be generated", "could not be retrieved")
    usable = [
        text for text in df['summary']
        if not any(marker in text for marker in failure_markers)
    ]
    if len(usable) == 0:
        return "Not enough successful summaries to generate an executive summary."
    combined = ". ".join(usable)
    return summarize_with_gemini(combined, max_words=250)

def get_full_article_text(url):
    """Download a page and return the text of its <p> elements joined by spaces.

    Args:
        url: Address of the article page.

    Returns:
        The joined paragraph text, or None when anything in the request or
        parsing raises.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        page = requests.get(url, headers=browser_headers, timeout=15)
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')
        return ' '.join(tag.get_text() for tag in parsed.find_all('p'))
    except Exception:
        # Best effort: callers treat None as "no scraped text available".
        return None

def parse_date(entry):
    """Format a feed entry's publication time as 'YYYY-MM-DD HH:MM'.

    Args:
        entry: Mapping supporting .get(); 'published_parsed' is preferred and
            'updated_parsed' is the fallback. The value is expected to be a
            sequence whose first six items are year, month, day, hour,
            minute, second.

    Returns:
        The formatted timestamp, or the current local time in the same format
        when no usable time tuple is present.
    """
    output_format = '%Y-%m-%d %H:%M'
    time_tuple = entry.get('published_parsed') or entry.get('updated_parsed')
    if time_tuple:
        try:
            return datetime(*time_tuple[:6]).strftime(output_format)
        except Exception:
            # An invalid tuple falls through to the "now" fallback below.
            pass
    return datetime.now().strftime(output_format)

def extract_companies(text):
    """Collect company names mentioned in *text*.

    Two sources are combined: whole-word, case-insensitive matches against
    COMPANIES, and organisation entities from the `ner` pipeline applied to
    the first 512 characters of *text* (kept only when the score exceeds 0.9).

    Args:
        text: The text to scan.

    Returns:
        A sorted list of unique company names, so the output is stable from
        run to run.
    """
    found = {c for c in COMPANIES if re.search(r'\b' + re.escape(c) + r'\b', text, re.I)}
    try:
        for ent in ner(text[:512]):
            if ent['entity_group'] == 'ORG' and ent['score'] > 0.9:
                found.add(ent['word'].strip())
    except Exception as exc:
        # NER is best effort: keep the keyword matches, but report the problem
        # instead of hiding it. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        print(f"      -> NER extraction skipped: {exc}")
    return sorted(found)

def extract_energy_types(text):
    """Return the energy types from ENERGY_TYPES that occur in *text*.

    Matching is done on the lower-cased text using whole-word boundaries.

    Args:
        text: The text to scan.

    Returns:
        A list of unique matches, each title-cased.
    """
    lowered = text.lower()
    matched = set()
    for energy in ENERGY_TYPES:
        if re.search(r'\b' + re.escape(energy) + r'\b', lowered):
            matched.add(energy.title())
    return list(matched)

def extract_amounts(text):
    """Extract monetary amounts and power/energy quantities from *text*.

    Three case-insensitive patterns are applied in order:
      1. a currency symbol (€, £, $) followed by a number and an optional
         magnitude (billion/million/thousand/bn/B/M/K);
      2. a number followed by billion/million/thousand and a currency word;
      3. a number followed by a power or energy unit (MW, GWh, ...).

    Args:
        text: The text to scan.

    Returns:
        A list of unique matched strings, ordered by pattern and then by
        position in the text.
    """
    patterns = [
        # The optional magnitude carries its own leading whitespace, so a match
        # never ends in a stray space or comma, and the number must contain a digit.
        r'(?:€|£|\$)\s*\.?\d[\d,.]*(?:\s*(?:billion|million|thousand|B|M|K|bn|m|k))?\b',
        # `[\d,.]*` (not `+`) so single-digit amounts such as "5 billion euros" match.
        r'\b\d[\d,.]*\s*(?:billion|million|thousand)\s*(?:dollars?|euros?|pounds?)\b',
        r'\b\d[\d,.]*\s*(?:MW|GW|kW|TW|MWh|GWh|kWh)\b',
    ]
    matches = []
    for pattern in patterns:
        matches.extend(re.findall(pattern, text, re.I))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))

def classify_news_nature(title, summary):
    """Assign a news category based on keywords in the title and summary.

    The title and summary are joined and lower-cased, then the categories are
    checked in a fixed order; the first category with any keyword occurring
    as a substring wins.

    Args:
        title: Article title.
        summary: Article summary.

    Returns:
        The matching category name, or 'General News' when nothing matches.
    """
    haystack = (title + " " + summary).lower()
    # Order matters: earlier categories take precedence over later ones.
    keyword_table = (
        ('Investment/Funding', ('investment', 'funding', 'raise', 'capital', 'acquire', 'sale', 'purchase')),
        ('Partnership/Deal', ('partnership', 'deal', 'agreement', 'collaboration', 'joint venture')),
        ('Project Development', ('project', 'launch', 'construct', 'develop', 'plant', 'delay')),
        ('Regulatory/Policy', ('regulation', 'policy', 'government', 'law', 'tariff', 'subsidy', 'auction')),
        ('Technology/Innovation', ('technology', 'innovation', 'breakthrough', 'research')),
        ('Market Update', ('market', 'price', 'forecast', 'report', 'capacity', 'grid')),
    )
    for category, keywords in keyword_table:
        for keyword in keywords:
            if keyword in haystack:
                return category
    return 'General News'


# ---### 6. HTML GENERATION FUNCTION (MODIFIED FOR DNV) ###---
def generate_html_file(df, exec_summary, strategic_insight, filename="index.html"):
    """Render the dashboard as a static HTML page and write it to *filename*.

    Args:
        df: DataFrame of processed articles with the columns 'link', 'title',
            'published_date', 'source', 'news_nature', 'summary', 'companies',
            'energy_types' and 'amounts'.
        exec_summary: Text shown in the "Executive Summary" section.
        strategic_insight: Lightly Markdown-formatted text: segments between
            '**' pairs become headings and '*' separated items become bullets.
        filename: Path of the HTML file to write (UTF-8).

    Returns:
        None. The page is written to disk and progress is printed.
    """
    # Imported locally so this function is self-contained.
    from html import escape

    def _safe(value):
        # Article and model text is untrusted: escape it before embedding in markup.
        return escape(str(value), quote=True)

    print(f"✍️ Generating HTML file: {filename}...")

    # Process the strategic insight text to convert Markdown to HTML
    insight_parts = []
    in_list = False
    for i, part in enumerate(strategic_insight.split('**')):
        part = part.strip()
        if not part:
            continue

        if i % 2 != 0:  # Odd segments sit between '**' markers: section titles
            if in_list:
                insight_parts.append("</ul>")
                in_list = False
            insight_parts.append(f"<h4>{_safe(part.strip(':'))}</h4>")
        else:  # Even segments hold the bullet points
            bullets = [item.strip() for item in part.split('*') if item.strip()]
            if bullets:
                if not in_list:
                    insight_parts.append("<ul>")
                    in_list = True
                insight_parts.extend(f"<li>{_safe(item)}</li>" for item in bullets)
    if in_list:
        insight_parts.append("</ul>")
    insight_html = "".join(insight_parts)
    if not insight_html.strip():
        # Nothing could be parsed: show the raw text, turning '*' into line breaks.
        insight_html = f"<p>{_safe(strategic_insight).replace('*', '<br>')}</p>"

    # Label the timestamp with the machine's actual local timezone instead of a
    # hard-coded "CEST", which is wrong whenever the host runs in another zone.
    last_updated = datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %Z')

    # The main HTML structure
    page_parts = [f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>DNV Intelligence Dashboard</title> <link rel="preconnect" href="https://fonts.googleapis.com">
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" rel="stylesheet">
        <style>
            :root {{
                --brand-primary: #0077b6; --brand-dark: #023e8a; --text-primary: #212529;
                --text-secondary: #495057; --bg-main: #f8f9fa; --bg-card: #ffffff;
                --border-color: #dee2e6; --shadow-color: rgba(0, 0, 0, 0.08);
            }}
            body {{ font-family: 'Inter', sans-serif; background-color: var(--bg-main); color: var(--text-primary); margin: 0; padding: 24px; }}
            .container {{ max-width: 950px; margin: auto; }}
            .header {{ background: var(--brand-dark); color: white; padding: 24px 30px; border-radius: 12px; text-align: center; margin-bottom: 30px; }}
            .header h1 {{ margin: 0; font-size: 32px; }}
            .header p {{ margin: 8px 0 0 0; opacity: 0.9; }}
            .insight-section {{ background: var(--bg-card); border: 1px solid var(--border-color); padding: 28px; margin-bottom: 30px; border-radius: 12px; box-shadow: 0 4px 12px var(--shadow-color); }}
            .insight-section h2 {{ margin-top: 0; color: var(--brand-dark); font-size: 24px; border-bottom: 1px solid var(--border-color); padding-bottom: 12px; margin-bottom: 20px; }}
            .insight-section p, .insight-section ul {{ line-height: 1.7; font-size: 16px; word-wrap: break-word; }}
            .insight-section ul {{ padding-left: 20px; margin-top: 0; }}
            .insight-section li {{ margin-bottom: 10px; }}
            .insight-section h4 {{ font-size: 18px; margin-bottom: 10px; margin-top: 20px; color: var(--brand-dark); }}
            .card {{ background: var(--bg-card); border: 1px solid var(--border-color); border-radius: 12px; padding: 24px; margin-bottom: 20px; box-shadow: 0 4px 12px var(--shadow-color); transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out; }}
            .card:hover {{ transform: translateY(-5px); box-shadow: 0 8px 20px rgba(0, 0, 0, 0.12); }}
            .card h3 a {{ text-decoration: none; color: var(--brand-primary); }}
            .card .meta {{ color: var(--text-secondary); font-size: 14px; margin-bottom: 16px; display: flex; align-items: center; flex-wrap: wrap; gap: 10px; }}
            .card .tag {{ background-color: #e9ecef; color: #495057; padding: 4px 10px; border-radius: 16px; font-size: 12px; font-weight: 500; }}
            .card .summary {{ line-height: 1.6; }}
            .card .details {{ background: #f8f9fa; padding: 16px; border-radius: 8px; margin-top: 20px; font-size: 14px; display: grid; grid-template-columns: 1fr 1fr; gap: 12px; border: 1px solid #e9ecef; }}
            h2.section-title {{ font-size: 26px; color: var(--text-primary); border-bottom: 2px solid var(--border-color); padding-bottom: 12px; margin-top: 40px; }}
            footer {{ text-align: center; margin-top: 40px; color: #888; font-size: 14px; }}
        </style>
    </head>
    <body>
        <div class="container">
            <header class="header">
                <h1>DNV Intelligence Dashboard</h1> <p>AI-Powered Market News Analysis | Last Updated: {last_updated}</p>
            </header>
            <div class="insight-section">
                <h2>Executive Summary</h2>
                <p>{_safe(exec_summary)}</p>
            </div>
            <div class="insight-section">
                <h2>Strategic Insight for DNV</h2> {insight_html}
            </div>
            <h2 class="section-title">Market News</h2>
    """]

    # One card per article row.
    for _, article in df.iterrows():
        page_parts.append(f"""
            <div class="card">
                <h3><a href="{_safe(article['link'])}" target="_blank">{_safe(article['title'])}</a></h3>
                <div class="meta">
                    <span>📅 {_safe(article['published_date'])}</span>
                    <span>|</span>
                    <span>📰 {_safe(article['source'])}</span>
                    <span class="tag">{_safe(article['news_nature'])}</span>
                </div>
                <p class="summary">{_safe(article['summary'])}</p>
                <div class="details">
                    <div><strong>🏢 Companies:</strong> {_safe(article['companies'])}</div>
                    <div><strong>⚡ Energy Type:</strong> {_safe(article['energy_types'])}</div>
                    <div style="grid-column: span 2;"><strong>💰 Amounts:</strong> {_safe(article['amounts'])}</div>
                </div>
            </div>
        """)

    page_parts.append("""
            <footer>Generated with Gemini. This is a static page.</footer>
        </div>
    </body>
    </html>
    """)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("".join(page_parts))
    print(f"✅ Successfully generated HTML file: {filename}")


# ---### 7. MAIN EXECUTION BLOCK ###---
if __name__ == '__main__':
    if gemini_is_configured:
        processed_df = fetch_and_process_news()

        # --- Filtering Logic ---
        # Rows whose summary contains one of these phrases (case-insensitive)
        # are treated as failed and removed before the dashboard is built.
        if not processed_df.empty:
            print("\n" + "-"*50)
            print("🔍 Filtering out articles that could not be summarized...")
            filter_phrases = [
                'unable to provide a summary',
                'does not include the actual content',
                'the provided text only contains',
                'please provide the full text',
                'article content could not be retrieved',
                'no market-specific information was found',
            ]
            filter_pattern = '|'.join(filter_phrases)
            original_count = len(processed_df)
            failed_mask = processed_df['summary'].str.contains(filter_pattern, case=False, na=False)
            processed_df = processed_df[~failed_mask]
            filtered_count = original_count - len(processed_df)
            print(
                f"✅ Filtered out {filtered_count} failed articles."
                if filtered_count > 0
                else "✅ No failed articles needed filtering."
            )
            print("-" * 50 + "\n")

        if processed_df.empty:
            print("❌ No articles could be successfully processed after filtering.")
        else:
            executive_summary = generate_executive_summary(processed_df)
            # MODIFIED: Changed company context to DNV
            strategic_insight = get_strategic_insight(executive_summary, company="DNV")

            # Create the dashboard file
            dashboard_filename = "DNV_StrategicNewsletter.html"
            generate_html_file(processed_df, executive_summary, strategic_insight, filename=dashboard_filename)

            print("\n" + "="*80)
            print("🎉 Process Complete! 🎉")
            print(f"✅ '{dashboard_filename}' has been created successfully.")
            print("You can now download it from the Colab file browser and upload it to your GitHub repository.")
    else:
        print("🔴 Please set up your GOOGLE_API_KEY in Colab Secrets to run the dashboard.")

✅ Successfully configured Gemini API.
🚀 Initializing Dashboard...
📰 Fetching targeted, multilingual news...
   📡 Searching in Germany...
   📡 Searching in Sweden...
   📡 Searching in Netherlands...
   📡 Searching in United Kingdom...
   📡 Searching in Norway...
✅ Fetched 20 total articles.

🧠 Processing 20 articles (this will take a very long time due to API pauses)...
   -> Processing 'Erneuerbare Energien: Reizthema Windenergie - Have...' from Germany
      -> Translating from 'de'...
      -> Translating from 'de'...
   -> Processing 'Erneuerbare Energien: Kommen Windräder in die Gren...' from Germany
      -> Translating from 'de'...
      -> Translating from 'de'...
   -> Processing 'IRW-News: ACCESS Newswire: Jama Connect(R) erreich...' from Germany
      -> Translating from 'de'...
      -> Translating from 'de'...
   -> Processing 'Erneuerbare Energien: Habecks Windkraft-Gesetz ern...' from Germany
      -> Translating from 'de'...
      -> Translating from 'de'...
   -> Proces