In [5]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment so the key never has to be hardcoded in the notebook.
load_dotenv()

# NYT API key; this is None when API_KEY is not set in the environment.
api_key = os.environ.get("API_KEY")

In [8]:
# Import libraries
import pandas as pd
import requests as req
import time
import glob

**Scraping Data Script**

We scrape NYT headlines using the Article Search API from the NYT Developer portal. Our Kaggle dataset covers the years 1980-2020, so we supplement it by fetching headlines for the years 2020-2024.

In [None]:
TOPIC = 'headline'  # not referenced by the code in this cell
url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'


def _doc_to_record(doc):
    """Flatten one article document from the API into a CSV row dict.

    Missing fields become None instead of raising, so a single incomplete
    document cannot abort the whole year's fetch.
    """
    headline = doc.get("headline") or {}
    return {
        "headline": headline.get("main"),
        "pub_date": doc.get("pub_date"),
        "url": doc.get("web_url"),
        "word_count": doc.get("word_count"),
        "news_desk": doc.get("news_desk"),
        "source": doc.get("source"),
    }


def get_articles(year, max_pages=100, delay_seconds=15, timeout=30):
    """Fetch NYT article metadata for one calendar year and save it as CSV.

    Pages are requested consecutively (0, 1, 2, ...) so no result page is
    skipped. Fetching stops at the first non-200 response, at the first page
    that comes back with no documents, or after ``max_pages`` pages.

    Whatever was collected is written to ``data/nyt_headlines_{year}.csv``
    even if the fetch is interrupted or raises part-way through. No file is
    written when nothing was collected, so downstream cells never read an
    empty CSV.

    Args:
        year: Calendar year to query (used for begin_date/end_date).
        max_pages: Maximum number of result pages to request. The Article
            Search API documents pagination up to 100 pages.
        delay_seconds: Pause between consecutive requests, to stay under
            the API's per-minute request limit.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame with one row per fetched article (possibly empty).
    """
    results = []
    output_path = f"data/nyt_headlines_{year}.csv"
    print(f"Fetching Year {year}")

    try:
        # Fetch paginated articles
        for page in range(max_pages):
            if page:
                # Pace requests; no pause is needed before the first one.
                time.sleep(delay_seconds)

            print("Getting page {num}".format(num=page))
            response = req.get(
                url,
                params={
                    "begin_date": f"{year}0101",
                    "end_date": f"{year}1231",
                    "api-key": api_key,
                    "page": page,
                },
                timeout=timeout,
            )

            # Unable to fetch anymore (bad key, rate limit, page out of range)
            if response.status_code != 200:
                print(f"fail (HTTP {response.status_code})")
                break

            data = response.json()
            if data.get("status") != "OK":
                # Skip a page the API did not mark as OK, as before.
                continue

            docs = (data.get("response") or {}).get("docs") or []
            if not docs:
                # Ran out of results; further pages would only burn quota.
                break

            results.extend(_doc_to_record(doc) for doc in docs)
    finally:
        # Save once, on every exit path, so partial results survive errors.
        df = pd.DataFrame(results)
        if results:
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            df.to_csv(output_path, index=False, encoding='utf-8')

    return df


In [None]:
# Fetch each supplementary year in turn (2020 through 2024, inclusive);
# every call writes that year's own CSV.
for year in (2020, 2021, 2022, 2023, 2024):
    get_articles(year)

In [9]:
# Combine the per-year CSVs into one supplement file.
# The glob pattern also matches the combined output file written below, so
# that file is filtered out; otherwise re-running this cell would fold the
# previous supplement back in and duplicate every row.
# Sorting gives a deterministic row order regardless of filesystem order.
file_paths = sorted(
    path
    for path in glob.glob("data/nyt_headlines_*.csv")
    if not path.endswith("nyt_headlines_supplement.csv")
)
data_frames = []

for file_path in file_paths:
    df = pd.read_csv(file_path)
    data_frames.append(df)

nyt_headlines = pd.concat(data_frames, ignore_index=True)
nyt_headlines.to_csv("data/nyt_headlines_supplement.csv", index=False, encoding='utf-8')

display(nyt_headlines.head())


Unnamed: 0,headline,pub_date,url,word_count,news_desk,source
0,Covid Forces Families to Rethink Nursing Home ...,2021-05-06T14:44:10+0000,https://www.nytimes.com/2021/05/06/health/covi...,1528,Science,The New York Times
1,Vita Vea Is a Defensive Minimalist With Maximu...,2021-10-07T16:36:11+0000,https://www.nytimes.com/2021/10/07/sports/foot...,846,Sports,The New York Times
2,Before a ‘Classy Dinosaur’ Wedding Came Leukem...,2021-09-10T09:00:30+0000,https://www.nytimes.com/2021/09/10/style/kaitl...,1511,Styles,The New York Times
3,Is New York City Ready for the Omicron Variant?,2021-12-01T23:00:11+0000,https://www.nytimes.com/2021/12/01/nyregion/ne...,1408,Metro,The New York Times
4,"Ganga Stone, Who Gave Sustenance to AIDS Patie...",2021-06-04T22:05:16+0000,https://www.nytimes.com/2021/06/04/nyregion/ga...,843,Obits,The New York Times
