# 99% Invisible Booklist

Author: Lindsey Viann Parkinson
Last updated: February 11, 2021


Scrapes the 99% Invisible podcast website, 99pi.org, and pulls information from the episodes that interview an author, specifically homing in on the phrase "author of" in the episode description.

In [32]:
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

# this is a fancy progress bar! works on jupyter notebook
from tqdm.notebook import tqdm

from time import sleep
from datetime import datetime

In [33]:
# List view of the episode archive; the paginated variant is requested later.
url = 'https://99percentinvisible.org/episodes/?view_option=list'
# Use the same 15 s timeout as the other requests in this notebook so a slow
# first response does not abort the run before the later cells are reached.
response = requests.get(url, timeout=15)

response.status_code  # 200 is good

200

In [34]:
# Parse the raw bytes of the listing page fetched above with the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")

In [35]:
# First episode block on the page, kept for interactive inspection of its markup;
# it is not referenced again in the cells visible here.
one_episode = soup.find("article", class_ = "list-block post episode")


In [36]:
def get_episodes(soup):
    """Extract one row per episode from a parsed episode-list page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a list-view episodes page.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``episode_number``, ``title`` and ``episode_link``,
        one row per ``<article class="list-block post episode">`` element, in
        page order. Empty (with the same columns) when nothing matches.

    Raises
    ------
    AttributeError
        If an episode block lacks the ``h3.list-title`` link or the
        ``a.play`` link.
    IndexError
        If an episode block contains fewer than four ``<span>`` elements.
    """
    cols = ["date", "episode_number", "title", "episode_link"]

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and, where it existed,
    # re-copied the whole frame for every added row.
    records = []
    for episode in soup.find_all("article", class_="list-block post episode"):
        episode_link = (
            episode.find("h3", {"class": "list-title"}).find("a").get("href")
        )

        # Date and episode number are picked by position among the block's
        # <span> elements (4th and 3rd respectively), so this depends on the
        # site's current markup.
        span_texts = [span.text for span in episode.find_all("span")]
        date = span_texts[3]
        episode_number = span_texts[2]

        title = episode.find("a", {"class": "play"}).get("title")

        records.append(
            {
                "date": date,
                "episode_number": episode_number,
                "title": title,
                "episode_link": episode_link,
            }
        )

    # Passing `columns` keeps the expected schema even when `records` is empty.
    return pd.DataFrame(records, columns=cols)


In [37]:
# Try the extractor on the first listing page parsed above.
df_page = get_episodes(soup)

### Get information for all episode pages

In [38]:
# The two fixed halves of a paginated listing URL; the page number is inserted
# between them when the URLs are built in the scraping loop.
link_first_part = "https://99percentinvisible.org/episodes/page/"
link_2_part = "/?view_option=list"

In [39]:
# Take the first "page-numbers" anchor, walk to the last <a> sibling after it and
# read its data-page-number attribute (a string, or None if the attribute is absent).
# NOTE(review): presumably that last sibling is the link to the final page — confirm
# against the site's pagination markup.
max_pages = soup.find("a",{"class": "page-numbers"}).find_next_siblings("a")[-1].get("data-page-number")

In [40]:
#remove later to scrape all pages
max_pages = 4

#can make this better. Don't need a second time
cols = ["date", "episode_number", "title", "episode_link"]
df = pd.DataFrame(columns = cols)

for page in tqdm(range(1, int(max_pages) + 1)):
    """Make the urls dynamic"""
    url = (
        link_first_part
        + str(page)
        + link_2_part
    )
    response = requests.get(url, timeout=15)
    soup = BeautifulSoup(response.content, "html.parser")

    df_page = get_episodes(soup)  # function created earlier
    df = df.append(df_page, ignore_index=True)

    sleep(0.6) # to keep up with human speed we need to slow down program

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [41]:
# Display the combined episode table collected from all scraped pages.
df

Unnamed: 0,date,episode_number,title,episode_link
0,02.08.21,Episode 430,The Doom Boom,https://99percentinvisible.org/episode/the-doo...
1,02.02.21,Episode 429,Stuccoed in Time,https://99percentinvisible.org/episode/stuccoe...
2,01.26.21,Episode 428,Beneath the Skyway,https://99percentinvisible.org/episode/beneath...
3,01.19.21,Episode 427,Mini-Stories: Volume 11,https://99percentinvisible.org/episode/mini-st...
4,01.12.21,Episode 426,Mini-Stories: Volume 10,https://99percentinvisible.org/episode/mini-st...
...,...,...,...,...
115,12.18.18,Episode 333,Mini Stories: Volume 5,https://99percentinvisible.org/episode/mini-st...
116,12.11.18,Episode 332,The Accidental Room,https://99percentinvisible.org/episode/the-acc...
117,12.04.18,Episode 331,Oñate’s Foot,https://99percentinvisible.org/episode/onates-...
118,11.27.18,Episode 330,Raccoon Resistance,https://99percentinvisible.org/episode/raccoon...


### Information from each episode link

In [42]:
# Pick the episode labelled 5 in the combined table as a sample page to explore;
# a single .loc lookup replaces the chained column-then-row indexing.
episode_link = df.loc[5, "episode_link"]
episode_link

'https://99percentinvisible.org/episode/mini-stories-volume-9/'

In [43]:
# Download and parse the sample episode page selected above.
response2 = requests.get(episode_link, timeout=15)
soup2 = BeautifulSoup(response2.content, "html.parser")
# Echoing the soup prints the entire parsed HTML document as the cell output.
soup2

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="#FFD300" name="theme-color">
<script>
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

  ga('create', 'UA-18711487-1', 'auto');
  ga('create', 'UA-67181499-1', 'auto', 'b');

  ga('send', 'pageview');
  ga('b.send', 'pageview');
</script>
<meta content="December 22, 2020" name="pi_date"/>
<meta content="single" name="pi_type"/>
<meta content="history" name="pi_cat"/>
<script type="text/javascript">
  var _pi=_pi||{};_pi.cat="history",_pi.pageType="single",_pi.reftypes={search:"^https?://[a-z0-9]+[.](search.aol.com

In [44]:
# Collect every <div class="credit"> on the sample episode page.
# NOTE(review): the recorded run shows this returning an empty list, so the selector
# presumably does not match this page's markup — confirm which element actually
# holds the episode description.
description = soup2.find_all("div", class_="credit")

In [45]:
# Show what the "credit" lookup returned for the sample episode.
description

[]