In [20]:
#import modules
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [53]:
import requests

my_url= "https://finance.yahoo.com/news?hl=en-US&guccounter=1" 
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36"
}

response = requests.get(my_url, headers=headers)

print("response.ok : {} , response.status_code : {}".format(response.ok, response.status_code))
print("Final URL after redirects (if any):", response.url)
print("Preview of response.text : ", response.text[:500])


response.ok : True , response.status_code : 200
Final URL after redirects (if any): https://finance.yahoo.com/news?hl=en-US&guccounter=1
Preview of response.text :  <!doctype html>
<html lang="en-US" theme="auto" data-color-theme-enabled="true" data-color-scheme="auto" class="desktop neo-green dock-upscale">
    <head>
        <meta charset="utf-8" />
        <meta name="oath:guce:consent-host" content="guce.yahoo.com" />
        <link rel="preconnect" href="//s.yimg.com" crossorigin="anonymous"><link rel="preconnect" href="//geo.yahoo.com"/><link rel="preconnect" href="//query1.finance.yahoo.com"/><link rel="preconnect" href="//query2.finance.yahoo.com"/><


In [2]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import time

def extract_article_text(url, headers):
    try:
        article_res = requests.get(url, headers=headers, timeout=10)
        if article_res.ok:
            article_soup = BeautifulSoup(article_res.text, 'html.parser')
            paragraphs = article_soup.find_all('p')
            article_text = " ".join(p.get_text(strip=True) for p in paragraphs)
            return article_text[:500]
        else:
            return f"Failed to fetch article: {article_res.status_code}"
    except Exception as e:
        return f"Error: {str(e)}"

def main():
    print("What topics are you interested in? (e.g., oil, inflation, interest rates)")
    raw_input = input("Enter keywords separated by commas: ")
    keywords = [k.strip().lower() for k in raw_input.split(',') if k.strip()]

    if not keywords:
        print("No keywords entered. Exiting.")
        return

    url = "https://finance.yahoo.com/news?hl=en-US"
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept-Language": "en-US,en;q=0.9"
    }

    print("\n Scraping Yahoo Finance News... please wait\n")
    response = requests.get(url, headers=headers)
    if not response.ok:
        print(f"❌ Request failed: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a", href=True)

    today = datetime.today().strftime("%Y-%m-%d")
    filename = f"yahoo_filtered_news_{today}.csv"
    matches = 0

    with open(filename, "w", newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Date", "Title", "Link", "Preview"])

        for link in links:
            text = link.get_text(strip=True).lower()
            href = link["href"]

            if not text or len(text) < 20:
                continue

            if href.startswith("/"):
                full_url = "https://finance.yahoo.com" + href
            elif href.startswith("http"):
                full_url = href
            else:
                continue

            if any(k in text for k in keywords):
                print(f"✅ MATCH: {text}")
                preview = extract_article_text(full_url, headers)
                writer.writerow([today, text.capitalize(), full_url, preview])
                matches += 1
                time.sleep(1)

    if matches:
        print(f"\n✅ {matches} articles saved to '{filename}'")
    else:
        print("\nNo matching articles found. Try different keywords.")

if __name__ == "__main__":
    main()


💬 What topics are you interested in? (e.g., oil, inflation, interest rates)


🔎 Enter keywords separated by commas:  Stocks, Trump, Tariff, China



📡 Scraping Yahoo Finance News...

✅ MATCH: stocks: most actives
✅ MATCH: stocks: most actives
✅ MATCH: verve therapeutics skyrockets — pulling gene-editing stocks higher — on $1.3 billion eli lilly takeover
✅ MATCH: navigating now: how parents can shield their families from tariff-driven inflation
✅ MATCH: navigating now: how to protect your small business from the latest round of tariffs

✅ 5 articles saved to 'yahoo_filtered_news_2025-06-17.csv'
