# Scrape Headlines

In [20]:
# Imports

from bs4 import BeautifulSoup
import requests

In [2]:
# Helper Function

def get_text(html_element):
    title_and_summary_tag = html_element.find_all('p')

    if len(title_and_summary_tag) == 0: return None
    
    if len(title_and_summary_tag) < 2: # This function is not very robust :(
        return title_and_summary_tag[0].text
        
    title   = title_and_summary_tag[0].text
    summary = title_and_summary_tag[1].text
    
    title_and_summary = title + ". " + summary
    title_and_summary

    return title_and_summary

In [3]:
# Chicago

# Request from Chicago Tribune
response = requests.get('https://www.chicagotribune.com/')

# Get HTML from response
html = BeautifulSoup(response.text)

# Find all article titles in HTML
headlines = [headline['title'] for headline in html.find_all(class_="article-title")]

# Print
headlines[:5]

['Gov. JB Pritzker’s running mate is a fellow Chicagoan, but says he will ‘represent all of Illinois’',
 'It’s been 30 years since the Grateful Dead’s final concerts at Soldier Field in Chicago: Their top 10 Chicago shows',
 'NASCAR Chicago Street Race puts city — in a seemingly endless loop through the Loop —\xa0back on national stage',
 'Fireworks 2025: All the Fourth of July shows in the Chicago area',
 'Asking-Eric']

In [4]:
# NY Times

# Request from Chicago Tribune
response = requests.get('https://www.nytimes.com/')

# Get HTML from response
html = BeautifulSoup(response.text)

# Find all article titles in HTML
headlines = [get_text(headline) for headline in html.find_all(class_="story-wrapper")]

# Print
headlines[:5]

['House Passes Sweeping Bill to Fulfill Trump’s Domestic Agenda',
 None,
 'LIVE. After Razor-Thin Vote, Measure Goes to President for His Signature',
 'Needed to win. Needed to win',
 'Jeffries, Breaking Record in Hourslong Speech, Has His Moment. 3 min read']

In [5]:
# LA Times

# Request from LA Times
response = requests.get('https://www.latimes.com/')

# Get HTML from response
html = BeautifulSoup(response.text, 'html.parser')

# Find all article titles in HTML
headlines = [headline['aria-label'].strip() for headline in html.find_all('a', class_='link promo-placeholder') if 'aria-label' in headline.attrs]

# Print top 5 headlines
headlines[:5]

[]

In [6]:
response = requests.get('https://www.latimes.com/')
print(response.text[:500])

Access to this site has been denied.


# Write headlines to a .txt file

In [7]:
# Imports

import datetime

In [8]:
# Helper functions

TODAY = datetime.datetime.today().strftime('%Y-%m-%d')

In [9]:
filename = f"headlines_chicagotribune_{TODAY}.txt"
filename

'headlines_chicagotribune_2025-07-03.txt'

In [10]:
filename = f"headlines_nyt_{TODAY}.txt"
filename

'headlines_nyt_2025-07-03.txt'

In [11]:
with open(filename, 'w', encoding='utf-8') as output_file:
    for headline in headlines:
        if headline is None: continue
        output_file.write(headline + '\n')

# Run Model
* Load Trained Model
* Read .txt headlines
* Encode into .npy file
* Predict Scores
* Output Results

In [12]:
# Imports

import numpy as np
import joblib
from sentence_transformers import SentenceTransformer

In [13]:
clf = joblib.load('svm.joblib')
model = SentenceTransformer("all-MiniLM-L6-v2")

In [14]:
# Test

embedding = model.encode("Everything is terrible")
embedding.shape

(384,)

In [15]:
# Test 

clf.predict([embedding])

array(['Neutral'], dtype=object)

In [16]:
# Test

with open('headlines_chicagotribune_2024-12-01.txt', 'r') as file:
    headlines = file.readlines()
    print(headlines[:5])

['IHSA state football playoffs: See who won the championship games, from 8A to 1A\n', 'Chicagoâ€™s winter parking ban goes into effect Sunday. Hereâ€™s what to know, snow or no snow.\n', 'Matt Eberflusâ€™ Chicago Bears timeline: 32 losses, multiple coach firings and too many late-game missteps\n', '10 best books of 2024: The surprising reads that stuck\n', 'Asking-Eric\n']


In [17]:
# Testing on existing file

# Load the embeddings
embeddings = np.load('headlines_chicagotribune_2024-12-01.npy')

# Load the original headlines (needed to print alongside predictions)
with open('headlines_chicagotribune_2024-12-01.txt', 'r') as file:
    headlines = [line.strip() for line in file.readlines()]

# Run predictions
predictions = clf.predict(embeddings)

# Print predictions with corresponding headlines
for pred, headline in zip(predictions, headlines):
    print(f"{pred}, {headline}")

Optimistic, IHSA state football playoffs: See who won the championship games, from 8A to 1A
Neutral, Chicagoâ€™s winter parking ban goes into effect Sunday. Hereâ€™s what to know, snow or no snow.
Neutral, Matt Eberflusâ€™ Chicago Bears timeline: 32 losses, multiple coach firings and too many late-game missteps
Neutral, 10 best books of 2024: The surprising reads that stuck
Pessimistic, Asking-Eric
Pessimistic, Today-in-History
Optimistic, Many Illinois health systems provide gender-affirming care. What happens when Donald Trump becomes president?
Pessimistic, â€˜Heâ€™s gonna benefit from being with the Speakerâ€™: Madigan trial offers inside view of private dealmaking
Optimistic, US Senate inquiry into Chicagoâ€™s housing of migrants at airports likely to heat up after Republican election wins
Pessimistic, The Nation of Islam flourished in Chicago after Elijah Muhammad took over from the movementâ€™s founder
Neutral, Running the ball. Catching the ball. Drew MacPherson does it all as 

# Get versions for requirements.txt file

In [18]:
import numpy
import joblib
import sentence_transformers
import sklearn

print("numpy:", numpy.__version__)
print("joblib:", joblib.__version__)
print("sentence-transformers:", sentence_transformers.__version__)
print("scikit-learn:", sklearn.__version__)

numpy: 1.26.4
joblib: 1.5.1
sentence-transformers: 4.1.0
scikit-learn: 1.5.1


In [22]:
import bs4
import requests

print("beautifulsoup4:", bs4.__version__)
print("requests:", requests.__version__)

beautifulsoup4: 4.12.3
requests: 2.32.3
