In [37]:
import requests
from bs4 import BeautifulSoup
import csv
import json
from tqdm import tqdm
import os

# NewsAPI configuration shared by the fetch helpers below.
# SECURITY: supply the key through the NEWSAPI_KEY environment variable.
# The literal is kept only as a backward-compatible fallback; a key committed
# to source control should be treated as leaked and rotated.
API_KEY = os.environ.get("NEWSAPI_KEY") or "1acd1280048c408393bc1dfbb1ba9e7b"  # ← replace with your NewsAPI key
NEWSAPI_URL = "https://newsapi.org/v2/everything"
# Browser-like User-Agent sent with every HTTP request made by this script.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def fetch_pc_metadata(query="Playstation", page_size=100, timeout=30):
    """Fetch article metadata for *query* from Polygon via NewsAPI.

    Only the first page of results is requested, sorted by publication date.

    Args:
        query: Search term sent as the NewsAPI ``q`` parameter.
        page_size: Number of results to request (max 100 per request).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The value stored under ``"articles"`` in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the response body has no ``"articles"`` key.
    """
    params = {
        "q": query,
        "sources": "polygon",
        "language": "en",
        "pageSize": page_size,  # max 100 per request
        "page": 1,
        "sortBy": "publishedAt",
        "apiKey": API_KEY,
    }
    resp = requests.get(
        NEWSAPI_URL, params=params, headers=HEADERS, timeout=timeout
    )
    resp.raise_for_status()
    return resp.json()["articles"]

def fetch_full_text(url, timeout=30):
    """Scrape the full article text from its URL.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up, so one
            stalled page cannot block the whole scrape.

    Returns:
        The text of every ``<p>`` inside the article container, one paragraph
        per line. An empty string if no container is found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Polygon article content lives under <div class="Article-body">;
    # fall back to the first <article> element when that div is absent.
    container = soup.find("div", class_="Article-body") or soup.find("article")
    paras = container.find_all("p") if container else []

    # get_text(strip=True) without a separator glues the text of adjacent
    # inline tags together ("Visit <a>Polygon</a> today" -> "VisitPolygontoday").
    # Extract with the original spacing and collapse whitespace runs instead.
    return "\n".join(" ".join(p.get_text().split()) for p in paras)

def scrape_and_save():
    """Fetch article metadata, scrape each article's text, and save the results.

    Creates the ``Data_csv`` and ``Data_json`` directories if needed, then
    writes one record per article (title, publishedAt, url, full_text) to
    both a CSV file and a JSON file. An article whose page cannot be fetched
    is still recorded, with an empty ``full_text``.
    """
    os.makedirs("Data_csv", exist_ok=True)
    os.makedirs("Data_json", exist_ok=True)

    # 1) Fetch metadata
    articles = fetch_pc_metadata()

    # 2) Scrape full text
    results = []
    for idx, meta in enumerate(tqdm(articles, desc="Fetching full texts"), 1):
        url = meta.get("url")
        full_text = ""
        if not url:
            # Nothing to download; skip a request that is guaranteed to fail.
            tqdm.write(f"[{idx}] Skipping article with no URL")
        else:
            try:
                full_text = fetch_full_text(url)
            except Exception as e:
                # Deliberate best-effort: one bad page must not abort the run.
                # tqdm.write keeps the message from garbling the progress bar.
                tqdm.write(f"[{idx}] Error fetching {url}: {e}")
        results.append({
            "title": meta.get("title", ""),
            "publishedAt": meta.get("publishedAt", ""),
            "url": url,
            "full_text": full_text,
        })

    # 3a) Save CSV
    csv_path = "Data_csv/playstation_polygon_news.csv"
    fieldnames = ["title", "publishedAt", "url", "full_text"]
    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"Saved CSV → {csv_path}")

    # 3b) Save JSON
    json_path = "Data_json/playstation_polygon_news.json"
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved JSON → {json_path}")

# Run the full scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    scrape_and_save()


Fetching full texts: 100%|██████████| 90/90 [00:05<00:00, 16.92it/s]

Saved CSV → Data_csv/playstation_polygon_news.csv
Saved JSON → Data_json/playstation_polygon_news.json



