In [4]:
%%pyspark
%pip install beautifulsoup4 requests azure-storage-blob lxml

In [14]:
%%pyspark
import json
import os
import time
from datetime import datetime

import requests
import yfinance as yf
from azure.storage.blob import BlobServiceClient
from bs4 import BeautifulSoup

In [15]:
# Storage connection string. Prefer AZURE_STORAGE_CONNECTION_STRING from the
# environment so the account key does not have to live in the notebook. The
# literal is kept only as a fallback; its AccountKey appears to be a redacted
# placeholder, so it will not authenticate as written.
conn_str = os.environ.get(
    "AZURE_STORAGE_CONNECTION_STRING",
    "DefaultEndpointsProtocol=https;AccountName=ba870container;AccountKey= key;EndpointSuffix=core.windows.net",
)
blob_service_client = BlobServiceClient.from_connection_string(conn_str)

# Inclusive publication window used when querying and filtering news.
# Both bounds are naive datetimes at midnight.
START_DATE = datetime(2021, 12, 27)
END_DATE   = datetime(2024, 9, 5)

# Location of the newline-separated ticker list.
container_name = "bronze"
blob_name = "yfinance_filtered_tickers.txt"

# Download the whole blob into memory and decode it as UTF-8 text.
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
content = blob_client.download_blob().readall().decode("utf-8")

# Parse tickers into a list, ignoring blank lines and surrounding whitespace.
tickers = [line.strip() for line in content.splitlines() if line.strip()]
print("Loaded", len(tickers), "tickers")

In [16]:
# ----- MAP TICKERS TO COMPANY NAMES -----
# Builds {ticker: shortName} from yfinance metadata. Tickers whose lookup fails
# or that have no shortName are left out of the mapping (and reported), so
# later cells only iterate over tickers with a usable company name.
ticker_to_name = {}
for ticker in tickers:
    try:
        # `.info` may come back empty for unknown symbols; treat that as "no name".
        info = yf.Ticker(ticker).info or {}
        company_name = info.get("shortName")
        if company_name:
            ticker_to_name[ticker] = company_name
            print(f"{ticker} → {company_name}")
        else:
            print(f"⚠️ No company name found for {ticker}")
    except Exception as e:
        print(f"⚠️ Failed to get company name for {ticker}: {e}")
    finally:
        # Throttle after every lookup, including failed ones, so a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(0.5)

In [17]:
def scrape_gnews_by_name(company_name, ticker, api_key, timeout=30):
    """Query the GNews search API for a company and return in-window articles.

    The request is restricted to English articles published between the
    module-level START_DATE and END_DATE; the same window is re-applied
    locally to every returned article.

    Args:
        company_name: Search query sent as the ``q`` parameter.
        ticker: Ticker symbol copied into every returned record.
        api_key: GNews API token.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dicts with keys ``company``, ``ticker``, ``headline``,
        ``link``, ``timestamp`` (ISO string) and ``source``. The list is empty
        when the request fails, the API answers with an error status, the
        body cannot be parsed, or no article falls inside the window.
    """
    base_url = "https://gnews.io/api/v4/search"
    # GNews documents `from`/`to` as full ISO-8601 UTC timestamps; the same
    # layout is used below to parse `publishedAt`.
    gnews_time_format = "%Y-%m-%dT%H:%M:%SZ"

    params = {
        "q": company_name,
        "lang": "en",
        "from": START_DATE.strftime(gnews_time_format),
        "to": END_DATE.strftime(gnews_time_format),
        "token": api_key,
        # Upper bound requested per call. NOTE(review): the API may return
        # fewer depending on the subscription plan - confirm against GNews docs.
        "max": 100,
    }

    filtered_news = []

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"❌ GNews request failed for {company_name}: {e}")
        return filtered_news

    # Surface API errors (bad token, quota exceeded, ...) instead of letting
    # them look like "no articles found".
    if not response.ok:
        print(f"❌ GNews returned HTTP {response.status_code} for {company_name}: {response.text[:200]}")
        return filtered_news

    try:
        results = response.json().get("articles") or []
    except Exception as e:
        print(f"❌ Error parsing GNews response for {company_name}: {e}")
        return filtered_news

    for item in results:
        # Parse and filter again just to be safe; one malformed article must
        # not discard the rest of the batch.
        try:
            published_str = item.get("publishedAt")
            if not published_str:
                continue

            news_date = datetime.strptime(published_str, gnews_time_format)
            if not (START_DATE <= news_date <= END_DATE):
                continue

            filtered_news.append({
                "company": company_name,
                "ticker": ticker,
                "headline": item.get("title"),
                "link": item.get("url"),
                "timestamp": news_date.isoformat(),
                # `source` can be missing or null; keep the article either way.
                "source": (item.get("source") or {}).get("name"),
            })
        except Exception:
            continue

    return filtered_news

In [18]:
# FIXME(security): the literal below is a credential committed to the notebook.
# Prefer the GNEWS_API_KEY environment variable; rotate this key and remove the
# fallback once the environment is configured.
GNEWS_API_KEY = os.environ.get("GNEWS_API_KEY", "08163c8807e33eb3b78e4c17a235b108")

# NEWS_OUTPUT_CONTAINER is not assigned in any cell visible in this notebook, so
# on a fresh kernel every upload would fail with NameError after the API call
# had already been spent. Reuse a value defined elsewhere if there is one,
# otherwise fall back to the container the ticker list was read from.
# NOTE(review): confirm that this is the intended destination container.
NEWS_OUTPUT_CONTAINER = globals().get("NEWS_OUTPUT_CONTAINER", container_name)

# For each ticker with a known company name: fetch in-window articles and, if
# there are any, upload them as one timestamped JSON blob under news/<ticker>/.
for ticker, name in ticker_to_name.items():
    try:
        news_items = scrape_gnews_by_name(name, ticker, GNEWS_API_KEY)

        if news_items:
            now = datetime.now().strftime("%Y%m%d_%H%M%S")
            # Separate name so the ticker-list `blob_name` from the config
            # cell is not overwritten.
            news_blob_name = f"news/{ticker}/{ticker}_gnews_filtered_{now}.json"

            json_string = json.dumps(news_items)
            output_blob_client = blob_service_client.get_blob_client(
                container=NEWS_OUTPUT_CONTAINER, blob=news_blob_name
            )
            output_blob_client.upload_blob(json_string, overwrite=True)

            print(f"✅ Uploaded {len(news_items)} articles for {ticker} ({name})")

        else:
            print(f"⚠️ No articles in range for {ticker}:({name})")

    except Exception as e:
        print(f"❌ Failed to upload for {ticker}: {e}")

    finally:
        # Throttle after every ticker, including failures, to avoid
        # throttling by GNews.
        time.sleep(1.5)