In [69]:
import time
import duckdb
import requests
from bs4 import BeautifulSoup

# Data prep

In [2]:
# Open a DuckDB connection against the special in-memory database.
con = duckdb.connect(database=":memory:")

In [18]:
# Select the rows of tests that tried more than one distinct headline.
#   unique_heds_per_test: COUNT(DISTINCT headline) per clickability_test_id.
#   multi:                the test ids whose distinct-headline count is > 1.
# The final SELECT returns clickability_test_id, headline, slug and first_place
# for every row belonging to one of those tests. The CSV is read directly by
# DuckDB from the relative path 'data/upworthy_exploratory.csv'.
q = """
WITH unique_heds_per_test AS (
SELECT
    clickability_test_id,
    COUNT(DISTINCT headline) AS num_headlines
FROM
    'data/upworthy_exploratory.csv'
GROUP BY 
    1
),
multi AS (
SELECT clickability_test_id FROM unique_heds_per_test WHERE num_headlines > 1
)
SELECT
    clickability_test_id,
    headline,
    slug,
    first_place
FROM
    'data/upworthy_exploratory.csv'
WHERE
    clickability_test_id IN (SELECT * FROM multi)
"""

# Execute the query on the connection and fetch the result set as a DataFrame.
df = con.execute(q).fetchdf()

In [43]:
def _drop_last_two_tokens(slug):
    """Return ``slug`` without its last two hyphen-separated tokens."""
    tokens = slug.split("-")
    return "-".join(tokens[:-2])


# NOTE(review): the two trailing tokens are presumably a per-variant suffix -- confirm.
df["slug_truncated"] = df["slug"].map(_drop_last_two_tokens)

In [45]:
# Pair every truncated slug with its first_place value as a
# (slug_truncated, first_place) tuple. Zipping the two columns yields the same
# tuples as a row-wise DataFrame.apply(axis=1) without building a Series object
# for every row.
df["slug_pairs"] = list(zip(df["slug_truncated"], df["first_place"]))

# Find Wayback records

In [75]:
# Ask the Wayback Machine CDX index for the captures of one Upworthy article.
# - The endpoint is called over HTTPS rather than plain HTTP.
# - An explicit timeout keeps a stalled connection from hanging the cell:
#   requests waits indefinitely when no timeout is given.
# - raise_for_status() surfaces an HTTP error status here instead of as a
#   confusing JSON decoding failure in a later cell.
r = requests.get(
    "https://web.archive.org/cdx/search/cdx",
    params={
        "url": "https://www.upworthy.com/45-seconds-in-and-suddenly-im-doubting-everything-i-thought-i-knew-about-the-holidays",
        "output": "json"
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    },
    timeout=30,
)
r.raise_for_status()

In [79]:
# Decode the CDX response body as JSON. The following cell reads row index 1
# of this structure and uses its fields 1 and 2 to build the snapshot URL.
r_json = r.json()

In [80]:
# Row 1 of the CDX response is used as the capture to link to; its fields 1 and
# 2 are slotted into the Wayback URL as the snapshot timestamp and target URL.
first_capture = r_json[1]
capture_timestamp = first_capture[1]
capture_original = first_capture[2]
url_wayback = f"https://web.archive.org/web/{capture_timestamp}/{capture_original}"

In [82]:
# check response for missing record

In [None]:
def check_wayback_archive(urls, delay=1, timeout=30):
    """Return the URLs from ``urls`` that have at least one Wayback Machine capture.

    Every URL is looked up in the Wayback Machine CDX API and one status line
    is printed per URL. A lookup that fails (network/HTTP error or a body that
    is not valid JSON) is reported and skipped; it never aborts the loop.

    Args:
        urls: Iterable of URL strings to look up.
        delay: Seconds to sleep after every lookup, whether it succeeded or not.
        timeout: Seconds to wait on the CDX server before abandoning a request.

    Returns:
        list: The URLs, in input order, whose CDX response contained at least
        one capture row.
    """
    archived_urls = []
    for url in urls:
        try:
            # Send request to Wayback Machine CDX API. Without a timeout a
            # single stalled connection would block the whole batch.
            response = requests.get(
                "https://web.archive.org/cdx/search/cdx",
                params={"url": url, "output": "json"},
                timeout=timeout,
            )
            response.raise_for_status()  # Check for HTTP errors

            # Decode separately: on some requests versions the JSON decoding
            # error derives from both RequestException and ValueError, so the
            # outer handler would otherwise mask it as a generic request error.
            try:
                data = response.json()
            except ValueError as e:
                print(f"Invalid JSON response for {url}: {e}")
                continue

            # With output=json the first row is the field-name header, so a
            # capture exists only when a second row is present.
            if len(data) > 1:
                archived_urls.append(url)
                print(f"Archived: {url}")
            else:
                print(f"Not archived: {url}")
        except requests.exceptions.RequestException as e:
            print(f"Error checking {url}: {e}")
        finally:
            # Respectful delay between requests -- also after a failure, so a
            # run of errors does not turn into back-to-back requests.
            time.sleep(delay)
    return archived_urls

# Parse body text

In [None]:
def get_body_text(slug):
    """Unfinished stub: assigns an empty URL string and implicitly returns ``None``.

    NOTE(review): ``slug`` is not used yet; presumably the URL is meant to be
    built from it -- TODO implement.
    """
    url = ""

In [52]:
# Download one archived snapshot of the article from the Wayback Machine.
# The explicit timeout keeps a stalled connection from hanging the cell, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
r = requests.get(
    "https://web.archive.org/web/20240920135734/https://www.upworthy.com/45-seconds-in-and-suddenly-im-doubting-everything-i-thought-i-knew-about-the-holidays",
    timeout=30,
)
r.raise_for_status()

In [67]:
soup = BeautifulSoup(r.text, "html.parser")
# The text is read from the second div carrying the "widget__body" class.
# NOTE(review): index 1 is presumably the article body on this page layout -- confirm.
body = soup.find_all("div", {"class": "widget__body"})[1]
# One find_all over headings and paragraphs returns the tags in document order.
# Collecting headings and paragraphs separately and concatenating the two lists
# would put every heading ahead of every paragraph and scramble the reading order.
all_tags = body.find_all(["h2", "h3", "h4", "h5", "h6", "p"])
text = " ".join(tag.text for tag in all_tags)

In [68]:
# Print the extracted text as plain output for a visual check.
print(text)

The fact is, everyone thinks they know why we celebrate Christmas. And nearly everyone is wrong. The history of Christmas is as varied and eclectic as the history of America. It's a blend of lots of different holidays and traditions. The real War on Christmas is being waged by those who want to limit it to Bing Crosby crooning to Baby Jesus. Let's stop the war. Merry Holidays, everyone!  Did you know that early Christians co-opted the Roman holiday of Saturnalia, a time of government-sanctioned, destructive, naked lawlessness? No joke! And that we have Christmas trees because an ancient cult liked to worship them?   So yeah, there is a "War on Christmas." And it's being waged by everyone who wants you to forget that this is a time for embracing all kinds of traditions. The real meaning of Christmas?  We all can celebrate together because winter is a challenging and dark time. Everyone needs an excuse for a party.  Whether you're a Catholic heading out for midnight Mass or a Jew getting

# Format for DPO