<a href="https://colab.research.google.com/github/Hisernberg/Ai-agentic-projects/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Ollama
!curl -fsSL https://ollama.com/install.sh | sh
!nohup ollama serve > /dev/null 2>&1 &
!ollama pull phi4

# Install Chromium and Chromedriver for Selenium
!apt-get update -y
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Install required Python packages
!pip install selenium pandas


>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[

In [2]:
import os
import requests
import urllib.robotparser as robotparser
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

def ensure_folder_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def fetch_file(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def save_file(content, file_path):
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        print(f"Error saving file {file_path}: {e}")

def parse_robots_txt(robots_url, base_url):
    sitemap_urls = set()
    robots_content = fetch_file(robots_url)
    if not robots_content:
        return sitemap_urls
    for line in robots_content.splitlines():
        if line.lower().startswith('sitemap:'):
            sitemap_url = line.split(':', 1)[1].strip()
            sitemap_urls.add(sitemap_url)
    return sitemap_urls

def parse_sitemap(sitemap_url, base_url):
    urls = set()
    sitemap_content = fetch_file(sitemap_url)
    if not sitemap_content:
        return urls
    try:
        root = ET.fromstring(sitemap_content)
        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        if root.tag.endswith('sitemapindex'):
            for sitemap in root.findall('ns:sitemap/ns:loc', namespace):
                urls.update(parse_sitemap(sitemap.text.strip(), base_url))
        else:
            for url in root.findall('ns:url/ns:loc', namespace):
                full_url = url.text.strip()
                urls.add(full_url)
    except ET.ParseError:
        pass
    return urls


In [3]:
def scrape_page(url):
    print(f"Scraping: {url}")
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "article"))
        )
        articles = driver.find_elements(By.TAG_NAME, "article")
        data = []
        for article in articles:
            try:
                title = article.find_element(By.TAG_NAME, "h2").text
                summary = article.find_element(By.TAG_NAME, "p").text
                data.append({"title": title, "summary": summary, "url": url})
            except:
                continue
        return data
    except:
        return []
    finally:
        driver.quit()


In [None]:
def ask_phi4(query):
    import subprocess
    prompt = f"Here is a collection of news articles. Find or summarize content related to: {query}.\n\n"
    with open("news_data.csv", "r", encoding='utf-8') as f:
        prompt += f.read()
    result = subprocess.run(["ollama", "run", "phi4"], input=prompt.encode(), capture_output=True)
    print(result.stdout.decode())


In [4]:
def run_news_aggregator(site_url="https://svelte.dev"):
    base_url = site_url if site_url.startswith("http") else "https://" + site_url
    robots_url = urljoin(base_url, "/robots.txt")
    sitemap_urls = parse_robots_txt(robots_url, base_url)
    all_urls = set()
    for sitemap_url in sitemap_urls:
        all_urls.update(parse_sitemap(sitemap_url, base_url))

    # If no sitemap found, fallback to homepage
    if not all_urls:
        all_urls.add(base_url + "/blog")

    all_articles = []
    for url in list(all_urls)[:5]:  # Limit for demo
        articles = scrape_page(url)
        all_articles.extend(articles)

    df = pd.DataFrame(all_articles)
    df.to_csv("news_data.csv", index=False)
    print(" News articles saved to 'news_data.csv'")
    display(df)

    while True:
        query = input("Ask your question about the news (or type 'exit'): ")
        if query.lower() == 'exit':
            break
        ask_phi4(query)

# Run
run_news_aggregator("https://svelte.dev")


Scraping: https://svelte.dev/blog
✅ News articles saved to 'news_data.csv'


Unnamed: 0,title,summary,url
0,What’s new in Svelte: May 2025,"Svelte Summit soon! Plus, await in components",https://svelte.dev/blog
1,What’s new in Svelte: April 2025,"Writable $derived statements, async reroute an...",https://svelte.dev/blog
2,What’s new in Svelte: March 2025,Congrats to the SvelteHack winners! Plus impro...,https://svelte.dev/blog
3,What’s new in Svelte: February 2025,"New types, pnpm 10 support and better syntax h...",https://svelte.dev/blog
4,What’s new in Svelte: January 2025,"Svelte 5 just keeps getting better. Plus, an i...",https://svelte.dev/blog
...,...,...,...
79,Using CSS-in-JS with Svelte,"You don’t need to, but you can",https://svelte.dev/blog
80,Svelte v2 is out!,Here’s what you need to know,https://svelte.dev/blog
81,Sapper: Towards the ideal web app framework,Taking the next-plus-one step,https://svelte.dev/blog
82,The zen of Just Writing CSS,"I would say this is the future, but we’re alre...",https://svelte.dev/blog


KeyboardInterrupt: Interrupted by user