# 99% Invisible Booklist

Author: Lindsey Viann Parkinson
Last updated: February 10, 2021


Scrapes the 99% Invisible podcast website, 99pi.org, and pulls information from the episodes that interview an author, specifically homing in on the phrase "author of" in the episode description.

In [6]:
import requests
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

# this is a fancy progress bar! works on jupyter notebook
from tqdm.notebook import tqdm

from time import sleep
from datetime import datetime

In [8]:
# First page of the episode archive, rendered in list view.
url = 'https://99percentinvisible.org/episodes/?view_option=list'
response = requests.get(url, timeout = 2)

# Stop here on a 4xx/5xx reply instead of letting later cells parse an error page.
response.raise_for_status()
response.status_code # 200 is good 

200

In [10]:
# Parse the downloaded page body with the "html.parser" backend.
soup = BeautifulSoup(markup=response.content, features="html.parser")



In [19]:
# Grab the first episode card on the page so its markup can be explored.
one_episode = soup.find("article", attrs={"class": "list-block post episode"})
one_episode

<article class="list-block post episode">
<div>
<a class="play" data-episode="the-doom-boom" data-episode-number="430" data-thumb="https://99percentinvisible.org/app/uploads/2021/02/1599px-Schindler-Bunker_12883100793-300x200.jpg" href="#play" title="The Doom Boom">
<span class="image" style="background-image: url('https://99percentinvisible.org/app/uploads/2021/02/1599px-Schindler-Bunker_12883100793-300x200.jpg');">
<span class="play-button">
<i class="fa fa-play"></i>
</span>
</span>
</a>
<div class="text">
<h4 class="list-meta">
<span>Episode 430</span>
<a href="/category/architecture">
              Architecture            </a>
<span>02.08.21</span>
</h4>
<h3 class="list-title"><a href="https://99percentinvisible.org/episode/the-doom-boom/">The Doom Boom</a></h3>
</div>
<div class="post-actions">
<ul class="action-row" role="list"><li><a class="queue" data-episode="the-doom-boom" data-episode-number="430" data-thumb="https://99percentinvisible.org/app/uploads/2021/02/1599px-Schindl

In [26]:
# The episode's page URL is the href of the link inside the title heading.
one_episode.find("h3", class_="list-title").find("a").get("href")
# syntax reminder: soup.find("div", {"class": "class_value", "id": "id_value"})

'https://99percentinvisible.org/episode/the-doom-boom/'

In [45]:
# The play button's anchor carries the episode title in its "title" attribute.
one_episode.find("a", class_="play").get("title")

'The Doom Boom'

In [43]:
# Text of every <span> in the card, in document order, to see which
# positions hold the episode number and the date.
span_list = [tag.text for tag in one_episode.find_all("span")]
span_list


['\n\n\n\n',
 '\n\n',
 'Episode 430',
 '02.08.21',
 'Add to Queue',
 'Remove from Queue',
 'Download']

In [50]:
def get_episodes(soup):
    """Collect one row of metadata per episode listed on an archive page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of an episode list page.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``episode_number``, ``title`` and ``episode_link``,
        one row per ``<article class="list-block post episode">`` element,
        in page order. Values are stored exactly as scraped. A page with no
        matching articles yields an empty frame with the same four columns.

    Raises
    ------
    IndexError
        If an episode article contains fewer than four ``<span>`` elements.
    AttributeError
        If an article is missing the title heading, its link, or the play
        anchor (the corresponding ``find`` call returns ``None``).
    """
    cols = ["date", "episode_number", "title", "episode_link"]

    # Gather plain dicts and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and it copied the whole
    # frame on every added row.
    records = []

    for episode in soup.find_all("article", class_ = "list-block post episode"):
        episode_link = episode.find("h3", {"class": "list-title"}).find("a").get("href")

        # Spans come back in document order: the two belonging to the play
        # button are first, then the "Episode N" label (index 2) and the
        # date (index 3).
        span_list = [span.text for span in episode.find_all("span")]
        episode_number = span_list[2]
        date = span_list[3]

        title = episode.find("a", {"class": "play"}).get("title")

        records.append(
            {
                "date": date,
                "episode_number": episode_number,
                "title": title,
                "episode_link": episode_link,
            }
        )

    # Passing `columns` keeps the column order and the schema of the empty case.
    return pd.DataFrame(records, columns=cols)


In [51]:
# Extract the episode table from the archive page parsed into `soup`.
df_page = get_episodes(soup)

In [54]:
# (rows, columns): one row per episode found on the page, one column per scraped field.
df_page.shape

(30, 4)

In [None]:
# Pagination pattern of the episode archive (page 1 has no /page/N/ segment):
#   https://99percentinvisible.org/episodes/?view_option=list
#   https://99percentinvisible.org/episodes/page/2/?view_option=list
#   https://99percentinvisible.org/episodes/page/3/?view_option=list