# 99% Invisible Booklist

Author: Lindsey Viann Parkinson
Last updated: February 10, 2021


Scrapes the 99% Invisible podcast website, 99pi.org, and pulls information from the episodes that interview an author. Specifically, it homes in on the phrase "author of" in the episode description.

In [6]:
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

# this is a fancy progress bar! works on jupyter notebook
from tqdm.notebook import tqdm

from time import sleep
from datetime import datetime

In [8]:
# Episode listing, requested with the `view_option=list` query parameter.
url = 'https://99percentinvisible.org/episodes/?view_option=list'
# NOTE(review): the timeout is 2 s here but 15 s in the paging loop further down — confirm which is intended.
response = requests.get(url, timeout = 2)

# Kept as the cell's last expression so the notebook displays the HTTP status.
response.status_code # 200 means the request succeeded

200

In [75]:
# Parse the raw response body into a navigable tree using the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")

In [76]:
# First <article> on the page whose class is "list-block post episode".
# NOTE(review): `one_episode` is not referenced again in the visible cells —
# presumably kept for interactive inspection of a single episode card; confirm.
one_episode = soup.find("article", class_ = "list-block post episode")


In [50]:
def get_episodes(soup):
    """Extract one row of metadata per episode from a parsed listing page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of an episode listing page.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``episode_number``, ``title`` and ``episode_link``,
        with one row per ``<article class="list-block post episode">``
        element, in page order. When the page contains no such element the
        frame is empty but still has these four columns.

    Raises
    ------
    IndexError
        If an episode card has fewer than four ``<span>`` elements.
    AttributeError
        If an episode card lacks the title heading/link or the play link.
    """
    cols = ["date", "episode_number", "title", "episode_link"]

    # Rows are gathered in a plain list and converted to a frame once.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop copied the whole frame again for every episode.
    records = []

    for episode in soup.find_all("article", class_="list-block post episode"):
        # The episode URL is the href of the link inside the title heading.
        episode_link = episode.find("h3", {"class": "list-title"}).find("a").get("href")

        # Episode number and date are picked by position from the card's
        # <span> texts: the third span is the number, the fourth the date.
        span_texts = [span.text for span in episode.find_all("span")]
        date = span_texts[3]
        episode_number = span_texts[2]

        # The episode title is stored in the `title` attribute of the play link.
        title = episode.find("a", {"class": "play"}).get("title")

        records.append(
            {
                "date": date,
                "episode_number": episode_number,
                "title": title,
                "episode_link": episode_link,
            }
        )

    # Passing `columns` fixes the column order and keeps the schema for an empty page.
    return pd.DataFrame(records, columns=cols)


In [51]:
# Episode table for the single listing page held in `soup`.
df_page = get_episodes(soup)

### Get information from all episode listing pages

In [82]:
# URL pieces for the paginated listing; the page number is inserted between the two parts.
link_first_part = "https://99percentinvisible.org/episodes/page/"
link_2_part = "/?view_option=list"

In [67]:
# Page count taken from the pagination links: find the first <a class="page-numbers">,
# take the last <a> sibling that follows it, and read its "data-page-number" attribute.
# The attribute value is a string, so it is converted with int() where the page range is built.
max_pages = soup.find("a",{"class": "page-numbers"}).find_next_siblings("a")[-1].get("data-page-number")

'16'

In [83]:
# Development limit: only the first few listing pages are scraped for now.
# TODO: remove this override to scrape every page.
max_pages = 4

# Same column layout that get_episodes produces; used here only as the
# schema of the result when no page was scraped at all.
cols = ["date", "episode_number", "title", "episode_link"]

# One frame per page is collected and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied every previously scraped row again on each iteration.
page_frames = []

for page in tqdm(range(1, int(max_pages) + 1)):
    # Build the list-view URL for this page number.
    url = link_first_part + str(page) + link_2_part
    response = requests.get(url, timeout=15)
    # Stop on an HTTP error instead of parsing an error page into an empty frame.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    df_page = get_episodes(soup)  # function created earlier
    page_frames.append(df_page)

    sleep(0.6)  # pause between requests so the site is not hit faster than a person browsing

if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=cols)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [84]:
# Combined episode table built by the paging loop; shown as the cell's output.
df

Unnamed: 0,date,episode_number,title,episode_link
0,02.08.21,Episode 430,The Doom Boom,https://99percentinvisible.org/episode/the-doo...
1,02.02.21,Episode 429,Stuccoed in Time,https://99percentinvisible.org/episode/stuccoe...
2,01.26.21,Episode 428,Beneath the Skyway,https://99percentinvisible.org/episode/beneath...
3,01.19.21,Episode 427,Mini-Stories: Volume 11,https://99percentinvisible.org/episode/mini-st...
4,01.12.21,Episode 426,Mini-Stories: Volume 10,https://99percentinvisible.org/episode/mini-st...
...,...,...,...,...
115,12.18.18,Episode 333,Mini Stories: Volume 5,https://99percentinvisible.org/episode/mini-st...
116,12.11.18,Episode 332,The Accidental Room,https://99percentinvisible.org/episode/the-acc...
117,12.04.18,Episode 331,Oñate’s Foot,https://99percentinvisible.org/episode/onates-...
118,11.27.18,Episode 330,Raccoon Resistance,https://99percentinvisible.org/episode/raccoon...
