# 99% Invisible Booklist

Author: Lindsey Viann Parkinson
Last updated: February 11, 2021


Scrapes the 99% Invisible podcast website, 99pi.org, and pulls information from the episodes that interview an author. It specifically homes in on the phrase "author of" in the episode description.

In [1]:
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

# this is a fancy progress bar! works on jupyter notebook
from tqdm.notebook import tqdm

from time import sleep
from datetime import datetime

In [2]:
url = 'https://99percentinvisible.org/episodes/?view_option=list'
# Use the same 15 s timeout as every other request in this notebook;
# a 2 s limit is easy to exceed on a slow connection.
response = requests.get(url, timeout=15)

response.status_code # 200 is good 

200

In [3]:
# Parse the HTML of the episode list page fetched above using Python's built-in parser.
soup = BeautifulSoup(response.content, "html.parser")

In [42]:
# First <article> whose class is exactly "list-block post episode".
# Presumably kept for inspecting the markup by hand; no later cell shown here uses it.
one_episode = soup.find("article", class_ = "list-block post episode")

In [5]:
def get_episodes(soup):
    """Extract one row per episode from a parsed episode-list page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a list-view page. Every
        ``<article class="list-block post episode">`` in it is treated as
        one episode.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``episode_number``, ``title`` and ``episode_link``,
        one row per episode in page order. The frame is empty (but has the
        same columns) when the page contains no matching articles.

    Raises
    ------
    IndexError
        If an episode block has fewer than four ``<span>`` elements.
    AttributeError
        If an episode block lacks the title heading/link or the play link.
    """
    cols = ["date", "episode_number", "title", "episode_link"]

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer
    # exists in pandas 2.0+.
    records = []

    for episode in soup.find_all("article", class_="list-block post episode"):
        episode_link = (
            episode.find("h3", {"class": "list-title"}).find("a").get("href")
        )

        # Date and episode number are read by position: the 4th <span> holds
        # the date and the 3rd holds the episode number.
        span_texts = [span.text for span in episode.find_all("span")]
        date = span_texts[3]
        episode_number = span_texts[2]

        title = episode.find("a", {"class": "play"}).get("title")

        records.append(
            {
                "date": date,
                "episode_number": episode_number,
                "title": title,
                "episode_link": episode_link,
            }
        )

    return pd.DataFrame(records, columns=cols)


In [6]:
# Try the parser on the single list page that is already loaded in `soup`.
df_page = get_episodes(soup)

### Get information from all episode list pages

In [7]:
# A paginated list URL is built as link_first_part + <page number> + link_2_part.
link_first_part = "https://99percentinvisible.org/episodes/page/"
link_2_part = "/?view_option=list"

In [8]:
# Start at the first "page-numbers" link, take the <a> siblings that follow it, and read
# the data-page-number attribute of the last one; that value is used as the page count.
# .get() returns a string, which is converted with int() where the page range is built.
max_pages = soup.find("a",{"class": "page-numbers"}).find_next_siblings("a")[-1].get("data-page-number")

In [9]:
# remove later to scrape all pages
max_pages = 4

cols = ["date", "episode_number", "title", "episode_link"]

# Collect one frame per list page and concatenate once after the loop:
# appending inside the loop copies the whole table on every page, and
# DataFrame.append no longer exists in pandas 2.0+.
page_frames = []

for page in tqdm(range(1, int(max_pages) + 1)):
    # Build the list-view URL for this page number.
    url = link_first_part + str(page) + link_2_part

    response = requests.get(url, timeout=15)
    # Stop on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    df_page = get_episodes(soup)  # function created earlier
    page_frames.append(df_page)

    sleep(0.6)  # pause between requests so the site is not hit faster than a human would browse

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns.
df = (
    pd.concat(page_frames, ignore_index=True)
    if page_frames
    else pd.DataFrame(columns=cols)
)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [10]:
# Show the combined table of episodes gathered from the scraped list pages.
df

Unnamed: 0,date,episode_number,title,episode_link
0,02.16.21,Episode 431,12 Heads from the Garden of Perfect Brightness,https://99percentinvisible.org/episode/12-head...
1,02.08.21,Episode 430,The Doom Boom,https://99percentinvisible.org/episode/the-doo...
2,02.02.21,Episode 429,Stuccoed in Time,https://99percentinvisible.org/episode/stuccoe...
3,01.26.21,Episode 428,Beneath the Skyway,https://99percentinvisible.org/episode/beneath...
4,01.19.21,Episode 427,Mini-Stories: Volume 11,https://99percentinvisible.org/episode/mini-st...
...,...,...,...,...
115,12.24.18,Episode 334,Christmas with The Allusionist,https://99percentinvisible.org/episode/christm...
116,12.18.18,Episode 333,Mini Stories: Volume 5,https://99percentinvisible.org/episode/mini-st...
117,12.11.18,Episode 332,The Accidental Room,https://99percentinvisible.org/episode/the-acc...
118,12.04.18,Episode 331,Oñate’s Foot,https://99percentinvisible.org/episode/onates-...


### Information from each episode link

In [31]:
# Use the first episode's URL as a worked example for scraping a single episode page.
episode_link = df["episode_link"][0]
episode_link

'https://99percentinvisible.org/episode/12-heads-from-the-garden-of-perfect-brightness/'

In [41]:
# Download and parse the example episode page.
response2 = requests.get(episode_link, timeout=15)
soup2 = BeautifulSoup(response2.content, "html.parser")


In [33]:
# First <footer> element on the page; the credit block is searched for inside it.
footer = soup2.find("footer")

In [40]:
# .contents is the list of child nodes of the first <p> inside <div class="credit">:
# plain strings interleaved with tags such as <em>, not a single text string.
description = footer.find("div", {"class": "credit"}).find("p").contents

['Producer Vivian Le spoke with Ai Weiwei, artist and activist; Patricia Yu, a doctoral candidate in the History of Art at UC Berkeley; Frederick Green, Associate Professor of Chinese and San Francisco State University; Zheng Wang, professor at the School of Diplomacy and International Relations at Seton Hall University and author of ',
 <em>Never Forget National Humiliation; </em>,
 'Lark Mason, Chinese art and antiquities specialist; Audrey Wang, historian of Chinese Art and author of ',
 <em>Chinese Antiquities: An Introduction to the Art Market. </em>,
 'Translation and production assistance by Wenjie Yang.']

In [51]:
def get_description(df):
    """Fetch every episode page and store its credit paragraph in ``df``.

    For each URL in ``df["episode_link"]`` the page is downloaded and the
    child nodes (``.contents``) of the first ``<p>`` inside
    ``<footer> ... <div class="credit">`` are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``episode_link`` column of episode URLs.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place: a
        ``description`` column is added (or overwritten) holding, per row,
        either the list of child nodes or the string ``'NA'`` when the page
        has no footer, credit block, or paragraph.

    Raises
    ------
    requests.exceptions.RequestException
        Network errors and timeouts are not caught and abort the scrape.
    """
    description = []
    for link in tqdm(df["episode_link"]):
        response2 = requests.get(link, timeout=15)
        soup2 = BeautifulSoup(response2.content, "html.parser")

        try:
            footer = soup2.find("footer")
            desc = footer.find("div", {"class": "credit"}).find("p").contents
        except AttributeError:
            # find() returns None for a missing element, so the next chained
            # call raises AttributeError. Only that case means "no credit
            # paragraph"; any other error (or Ctrl-C) should surface rather
            # than be recorded as 'NA'.
            desc = 'NA'
        description.append(desc)

    df["description"] = description
    return df

In [52]:
# Adds the "description" column to df in place; the returned frame is what gets displayed.
get_description(df)

HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))




Unnamed: 0,date,episode_number,title,episode_link,description
0,02.16.21,Episode 431,12 Heads from the Garden of Perfect Brightness,https://99percentinvisible.org/episode/12-head...,"[Producer Vivian Le spoke with Ai Weiwei, arti..."
1,02.08.21,Episode 430,The Doom Boom,https://99percentinvisible.org/episode/the-doo...,"[Host Roman Mars spoke with Bradley Garrett, a..."
2,02.02.21,Episode 429,Stuccoed in Time,https://99percentinvisible.org/episode/stuccoe...,[Delaney Hall spoke with architect Trey Jordan...
3,01.26.21,Episode 428,Beneath the Skyway,https://99percentinvisible.org/episode/beneath...,[Reporter Katie Thornton spoke with Bill Linde...
4,01.19.21,Episode 427,Mini-Stories: Volume 11,https://99percentinvisible.org/episode/mini-st...,
...,...,...,...,...,...
115,12.24.18,Episode 334,Christmas with The Allusionist,https://99percentinvisible.org/episode/christm...,
116,12.18.18,Episode 333,Mini Stories: Volume 5,https://99percentinvisible.org/episode/mini-st...,
117,12.11.18,Episode 332,The Accidental Room,https://99percentinvisible.org/episode/the-acc...,"[This piece was reported by , [Vanessa Lowe], ..."
118,12.04.18,Episode 331,Oñate’s Foot,https://99percentinvisible.org/episode/onates-...,"[Reporter Stan Alcorn spoke with , [Larry Call..."


In [55]:
# Inspect one raw description: a list mixing plain strings with <a>/<em> tags.
df['description'][117]

['This piece was reported by ',
 <a href="https://twitter.com/nesslowe?lang=en" rel="noopener noreferrer" target="_blank">Vanessa Lowe</a>,
 ' and edited for\xa0',
 <em>99% Invisible</em>,
 '\xa0by producer Joe Rosenberg. A\xa0different version\xa0originally aired on the podcast\xa0 ',
 <a href="https://nocturnepodcast.org/" rel="noopener noreferrer" target="_blank">Nocturne</a>,
 '.']