<h1>ADS509 Final Project: News Source Classification and Topic Modeling</h1>

By Matt Ammirati

<h3> Project Overview </h3> - Predict whether an article comes from Fox news or CNN and analyze major discussion topics


In [2]:
import pandas as pd
import requests
from datetime import datetime, timedelta



In [1]:
import json
import os

# Load API key securely
config_path = os.path.join("..", "config.json")

with open(config_path) as f:
    api_key = json.load(f)["newsapi_key"]

print("API key loaded successfully.")


API key loaded successfully.


<h3>Data Collection</h3>

In [8]:
def fetch_articles(source, query=None, from_days=7, page_size=100, max_pages=3):
    """
    Fetch recent articles from a given source using NewsAPI.
    
    Args:
        source (str): The news source (e.g. 'cnn' or 'fox-news')
        query (str): Optional keyword to filter articles
        from_days (int): How many days back to pull articles
        page_size (int): Number of results per page (max 100)
        max_pages (int): How many pages to fetch
    """
    base_url = "https://newsapi.org/v2/everything"
    all_articles = []

    from_date = (datetime.now() - timedelta(days=from_days)).strftime("%Y-%m-%d")

    for page in range(1, max_pages + 1):
        params = {
            "sources": source,
            "q": query,
            "from": from_date,
            "pageSize": page_size,
            "page": page,
            "apiKey": api_key,
            "language": "en",
        }

        response = requests.get(base_url, params=params)
        data = response.json()

        if data.get("status") != "ok":
            print(f"Error fetching {source} page {page}: {data.get('message')}")
            break

        articles = data.get("articles", [])
        if not articles:
            break

        all_articles.extend(articles)

    print(f"Retrieved {len(all_articles)} articles from {source}.")
    return pd.DataFrame(all_articles)


In [9]:
cnn_df = fetch_articles("cnn", max_pages=3)
fox_df = fetch_articles("fox-news", max_pages=3)


Error fetching cnn page 2: You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results.
Retrieved 100 articles from cnn.
Error fetching fox-news page 2: You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results.
Retrieved 100 articles from fox-news.


<h3>Descriptive Statistics</h3>

In [10]:
# Combine both sources before cleaning for descriptive stats
cnn_df["label"] = "CNN"
fox_df["label"] = "Fox"
raw_df = pd.concat([cnn_df, fox_df], ignore_index=True)

# Basic counts
print("Total articles:", len(raw_df))
print("CNN articles:", len(cnn_df))
print("Fox News articles:", len(fox_df))

# Null values per column
print("\nMissing values per column:")
print(raw_df.isnull().sum())

# Combine text for simple length stats
raw_df["combined_text"] = (
    raw_df["title"].fillna("") + " " +
    raw_df["description"].fillna("") + " " +
    raw_df["content"].fillna("")
)

# Calculate article lengths
raw_df["word_count"] = raw_df["combined_text"].apply(lambda x: len(x.split()))

# Summary stats
print("\n--- Descriptive Stats ---")
print(raw_df["word_count"].describe())

# Grouped by source
print("\nAverage word count by source:")
print(raw_df.groupby("label")["word_count"].mean())

# Quick preview
raw_df[["label", "title", "word_count"]].head(10)


Total articles: 200
CNN articles: 100
Fox News articles: 100

Missing values per column:
source           0
author         101
title            0
description      0
url              0
urlToImage       0
publishedAt      0
content          0
label            0
dtype: int64

--- Descriptive Stats ---
count    200.000000
mean      75.475000
std        9.242646
min       46.000000
25%       68.750000
50%       74.000000
75%       81.250000
max      101.000000
Name: word_count, dtype: float64

Average word count by source:
label
CNN    79.15
Fox    71.80
Name: word_count, dtype: float64


Unnamed: 0,label,title,word_count
0,CNN,Live updates: Israeli hostage release from Gaz...,63
1,CNN,DEVELOPING: HOSTAGES RELEASED,72
2,CNN,Stock futures rise after Trump hints at backin...,80
3,CNN,"Mark Sanchez booked and released from custody,...",70
4,CNN,James Franklin fired as Penn State head coach ...,74
5,CNN,"Diane Keaton was a pioneer for modern women, b...",65
6,CNN,More than half of CDC staffers recently fired ...,76
7,CNN,Hamas asserts control in Gaza and targets alle...,75
8,CNN,"South Carolina bar shooting: 4 people killed, ...",73
9,CNN,Texas Tech head coach pleads with tortilla-thr...,78


<h3> Data Cleaning</h3>

<h3>EDA</h3>