In [1]:
import requests
from bs4 import BeautifulSoup
from ollama import Client
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import random
import time
import re


In [2]:
def summarize_news(url):
    # Load page using requests
    print('url',url)
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        return f"Failed to fetch page: {response.status_code}"

    # Parse with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    text = ' '.join([p.get_text() for p in paragraphs])

    if not text.strip():
        return "No readable text found — page might require JavaScript."

    # Summarize using Ollama (Mistral)
    client = Client()
    response = client.chat(
        model='mistral',
        messages=[{'role': 'user', 'content': f"Find out what news is covered in the major part of this article and summarize it in a long single paragraph simple language for anyone to understand.:\n\n{text[:3000]}"}]
    )

    print(response['message']['content'])

    return response['message']['content']


In [3]:
def get_news_links(url, keyword_filters=["news", "article", "story"]):
    try:
        time.sleep(random.uniform(3, 6))
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch {url} — Status: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)
        full_links = [urljoin(url, a['href']) for a in links]

        # Filter
        noise = ['login', 'subscribe', 'advert', 'cookie', 'feedback', '#']
        news_links = [
            link for link in full_links
            if any(k in link.lower() for k in keyword_filters)
            and not any(n in link.lower() for n in noise)
        ]

        return list(set(news_links))  # remove duplicates

    except Exception as e:
        print(f"Error: {e} : {url}")
        return []

In [4]:
visited_pages = set()
all_article_links = set()
all_pagination_links = set()
done_paginated = set()



In [5]:
def crawl_news_pages_iterative(base_url):
    to_visit = set([base_url])
    visited = set()
    articles = set()

    while to_visit:
        url = to_visit.pop()
        visited.add(url)

        links = get_news_links(url)
        filtered = [l for l in links if l.startswith(base_url) and l != base_url]

        paginated = {l for l in filtered if re.search(r'/page-\d+/?$', l)}
        article_links = set(filtered) - paginated

        articles = articles|article_links
        new_pages = paginated - visited
        to_visit.update(new_pages)
        #print("ToDo:",list(to_visit))

    return articles


In [6]:
base_url = "https://www.moneycontrol.com/news/business/economy/"
articles = crawl_news_pages_iterative(base_url)
# This is 10 times faster than selenium. But it may not work for all websites. It works for moneycontrol though.


In [7]:
articles

{'https://www.moneycontrol.com/news/business/economy/a-tenth-of-eu-exports-to-indonesia-vietnam-under-threat-as-us-seals-trade-deals-13290563.html',
 'https://www.moneycontrol.com/news/business/economy/adani-denies-doj-probe-over-iranian-lpg-shipments-how-are-india-s-energy-relations-with-tehran-13092336.html',
 'https://www.moneycontrol.com/news/business/economy/air-travel-between-non-metro-cities-may-fly-past-traffic-of-eight-metros-this-year-13095766.html',
 'https://www.moneycontrol.com/news/business/economy/amid-an-ongoing-tariff-war-it-s-not-all-roses-for-india-s-flower-exports-13000834.html',
 'https://www.moneycontrol.com/news/business/economy/amid-reciprocal-tariff-niti-aayog-suggests-dual-track-approach-in-india-us-trade-13091473.html',
 'https://www.moneycontrol.com/news/business/economy/amid-tariff-turmoil-fieo-says-businesses-targetting-export-of-goods-and-services-worth-1-trillion-in-fy26-13050864.html',
 'https://www.moneycontrol.com/news/business/economy/annual-mean-tem

In [None]:
def filter_links_by_keywords(links, keywords, mode="any"):
    # mode is either "any" or "all"
    matching_links = set()
    other_links = set()
    headers = {'User-Agent': 'Mozilla/5.0'}

    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=10)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            text = ' '.join([p.get_text().lower() for p in soup.find_all('p')])

            if mode == "any" and any(k in text for k in keywords):
                matching_links.add(link)
            elif mode == "all" and all(k in text for k in keywords):
                matching_links.add(link)
            else:
                other_links.add(link)

        except Exception as e:
            print(f"Error with {link}: {e}")

    return matching_links, other_links


In [None]:
match, rest = filter_links_by_keywords(set(list(articles)[:5]), ["women"], mode="any")
# mode is either "any" or "all"

In [41]:
match

set()

In [42]:
len(articles)

344

In [43]:
def summarize_multiple_news(links):
    headers = {'User-Agent': 'Mozilla/5.0'}
    combined_text = ""

    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=10)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')
            text = ' '.join([p.get_text() for p in paragraphs])
            if text.strip():
                combined_text += f"\n\n--- ARTICLE: {link} ---\n{text}"
        except Exception as e:
            print(f"Error with {link}: {e}")

    if not combined_text.strip():
        return "No readable content found in the provided links."

    # Summarize using Ollama
    client = Client()
    response = client.chat(
        model='mistral',
        messages=[{
            'role': 'user',
            'content': f"Give slong summary for all the news articles below together in simple language for general understanding.:\n\n"
        }]
    )

    return response['message']['content']


In [44]:
summarize_multiple_news(match)

'No readable content found in the provided links.'

In [45]:
def interpret_prompt(prompt_text):
    client = Client()
    system_prompt = (
    "You are an AI that extracts search parameters from a user's question to help find news articles.\n"
    "Return a Python dictionary with the following keys:\n"
    "- 'keywords': important words to search\n"
    # "- 'date': the mentioned date range (format: 'Month YYYY') or None\n"
    "- 'match_mode': 'any' if at least one keyword is enough to match, or 'all' if all keywords must be present\n\n"
    "Rules for match_mode:\n"
    "- Use 'any' for open-ended or broad questions like: 'Tell me news about Tesla and Elon Musk'\n"
    "- Use 'all' if the user clearly asks for a **specific event or relationship** among the keywords, like: 'Why did HDFC stock fall?'\n\n"
    "Example 1:\n"
    "User: What happened with Infosys in June 2023?\n"
    "Output: {'keywords': ['infosys'], 'match_mode': 'any'}\n\n"
    "Example 2:\n"
    "User: Why did Adani stock fall after the budget?\n"
    "Output: {'keywords': ['adani', 'stock', 'fall', 'budget'], 'match_mode': 'all'}\n\n"
    "Now extract from the following:\n"
    )


    response = client.chat(
        model='mistral',
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text}
        ]
    )

    try:
        extracted = eval(response['message']['content'])
        return extracted
    except Exception as e:
        print(f"Error parsing response: {e}")
        return None


In [46]:
def get_news_summary_from_prompt(prompt_text):
    params = interpret_prompt(prompt_text)
    print("params:",params)
    if not params:
        return "Couldn't understand your request."

    base_url = "https://www.moneycontrol.com/news/business/economy/"
    article_links = crawl_news_pages_iterative(base_url)  # already built

    matching_links, _ = filter_links_by_keywords(
        set(list(article_links)[:10]),
        keywords=params['keywords'],
        mode=params['match_mode']
    )
    print("matching_article_links:",matching_links)
    if not matching_links:
        return "No matching articles found."

    summary = summarize_multiple_news(matching_links)
    return summary


In [47]:
# TRYING CACHE

In [48]:
import json
import os
import time

CACHE_FILE = "cached_articles.json"
CACHE_EXPIRY = 3 * 60 * 60  # 3 hours in seconds

def load_cached_links():
    if not os.path.exists(CACHE_FILE):
        return None
    with open(CACHE_FILE, 'r') as f:
        data = json.load(f)
    if time.time() - data['timestamp'] > CACHE_EXPIRY:
        return None
    return set(data['links'])

def save_links_to_cache(links):
    with open(CACHE_FILE, 'w') as f:
        json.dump({
            'timestamp': time.time(),
            'links': list(links)
        }, f)

def crawl_with_cache(base_url):
    cached = load_cached_links()
    if cached:
        print("Loaded article links from cache.")
        return cached
    print("No valid cache found. Crawling fresh...")
    links = crawl_news_pages_iterative(base_url)
    save_links_to_cache(links)
    return links


In [49]:
def get_news_summary_from_prompt(prompt_text):
    params = interpret_prompt(prompt_text)
    print("params:",params)
    if not params:
        return "Couldn't understand your request."

    base_url = "https://www.moneycontrol.com/news/business/economy/"
    article_links = crawl_with_cache(base_url)

    matching_links, _ = filter_links_by_keywords(
        set(list(article_links)[:25]),
        keywords=params['keywords'],
        mode=params['match_mode']
    )
    print("matching_article_links:",matching_links)
    if not matching_links:
        return "No matching articles found."

    if len(list(matching_links))>5:
        matching_links = set(list(matching_links)[:5])
        
    summary = ''
    for link in matching_links:
        print("1:",link)
        summary += summarize_news(link)
        summary += '\n\n\n\n\n'
    return summary


In [50]:
query = "Did any share go up?"
result = get_news_summary_from_prompt(query)
print("\n--- Summary ---\n")
print(result)

params: {'keywords': ['share', 'go up'], 'match_mode': 'any'}
No valid cache found. Crawling fresh...
matching_article_links: {'https://www.moneycontrol.com/news/business/economy/coal-production-rises-by-a-modest-3-63-percent-in-april-to-cross-81-million-tonne-13010315.html', 'https://www.moneycontrol.com/news/business/economy/india-stands-to-lose-5-billion-of-exports-to-vietnam-in-no-us-trade-deal-scenario-13238264.html', 'https://www.moneycontrol.com/news/business/economy/india-uk-fta-to-boost-market-access-signals-shift-to-bilateral-trade-sitharaman-13015333.html', 'https://www.moneycontrol.com/news/business/economy/rbi-moves-to-deepen-junk-debt-market-by-allowing-bad-loan-securitisation-12994417.html', 'https://www.moneycontrol.com/news/business/economy/natural-disasters-claim-over-3-000-lives-in-india-in-2024-25-highest-in-11-years-13099910.html', 'https://www.moneycontrol.com/news/business/economy/chinese-media-front-loads-pakistani-military-claims-and-civilian-casualties-1301547

In [51]:
print(result)

 This article covers two main topics: the latest developments in India's coal production and market updates on various stocks and indices.

In terms of coal production, India produced 81.57 million tonnes (MT) in April 2025, representing a 3.63 percent increase from the same period in 2024. The growth was significantly higher in captive and commercial mines, with a 26.6 percent rise from 11.46 MT to 14.51 MT. This surge is attributed to the efforts of the Ministry of Coal and its subsidiaries in ensuring a consistent supply and operational stability in the sector. The total coal dispatch also saw a steady increase, reaching 86.64 MT compared to 85.11 MT in April last year. The coal stock held by coal companies significantly increased to 125.76 MT in April, up from 102.41 MT the previous year. This indicates an annual growth rate of 22.8 percent, reflecting the robust performance and efficiency of the coal sector.

The second part of the article focuses on market updates. It provides in