In [1]:
"""
99% Invisible Booklist
Author: Lindsey Viann Parkinson Last updated: February 11, 2021
Scrapes the 99% Invisible podcast website, 99pi.org, and pulls information from the episodes that interview an author. Specifically honing in on "author of" in the episode description
"""
# Packages
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm # this is a fancy progress bar! works on jupyter notebook
from time import sleep
from datetime import datetime



#url = 'https://99percentinvisible.org/episodes/?view_option=list'


def get_episodes(url):
    """
    Returns episode information from one page of 99pi episodes list
    url = 99pi url of episode list
    """

    print(f"^ scraping basic episode information. 20 episodes per page")
    response = requests.get(url, timeout = 2)
    print(f"Testing link of each page. 200 is good: {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    cols = ["date", "episode_number", "title", "episode_link"]
    episodes_onepage = pd.DataFrame(columns = cols)

    all_episodes = soup.find_all("article", class_ = "list-block post episode")
    for episode in all_episodes:
        
        episode_link = episode.find("h3", {"class": "list-title"}).find("a").get("href")
        
        span_list = []
        for span in episode.find_all("span"):
            span_list.append(span.text) 
        
        date = span_list[3]

        episode_number = span_list[2]

        episode_title = episode.find("a",{"class": "play"}).get("title")

        
        episodes_onepage = episodes_onepage.append(
            {
                "date": date,
                "episode_number": episode_number,
                "episode_title": episode_title,
                "episode_link": episode_link
            },
            ignore_index=True,
        )
        
    return episodes_onepage


#Scrape all pages or set page limit
#all_pages = soup.find("a",{"class": "page-numbers"}).find_next_siblings("a")[-1].get("data-page-number")
#max_pages = 4


def get_all_episodes(max_pages):
    link_first_part = "https://99percentinvisible.org/episodes/page/"
    link_2_part = "/?view_option=list"
    
    cols = ["date", "episode_number", "title", "episode_link"]
    episodes_multipage = pd.DataFrame(columns = cols)
    
    for page in tqdm(range(1, int(max_pages) + 1)):
        
        url = (
            link_first_part
            + str(page)
            + link_2_part
        )
        episodes_multipage = episodes_multipage.append(get_episodes(url), ignore_index=True)# = get_episodes(url).append(episodes_onepage, ignore_index=True)
        sleep(0.6) # to keep up with human speed we need to slow down program
    
    return episodes_multipage

# episode description extracted from each link
# appended to existing dataframe
def get_description(max_pages):
    """
    max_pages is the number of 99pi website episode list pages to scrape
    
    get_description calls 'get_all_episodes' which calls 'get_episodes'
    
    episode description extracted from each link
    appended to dataframe from get_all_episodes
    """
    df = get_all_episodes(max_pages)
    description = []
    print(f"Pulling every episode's description:")
    for link in tqdm(df["episode_link"]):
        
        response2 = requests.get(link, timeout=15)
        soup2 = BeautifulSoup(response2.content, "html.parser")
        
        try: 
            footer = soup2.find("footer")
            desc = footer.find("div", {"class": "credit"}).find("p").contents
            description.append(desc)
        except:
            description.append('NA')
    
    df["description"] = description
    
    return df



In [2]:
url = 'https://99percentinvisible.org/episodes/?view_option=list'


def total_episode_pages(url): 
    response = requests.get(url, timeout = 2)
    soup = BeautifulSoup(response.content, "html.parser")

    total_pages = soup.find("a",{"class": "page-numbers"}).find_next_siblings("a")[-1].get("data-page-number")
    return f"99% Invisible has {total_pages} pages of episodes"
    
total_episode_pages(url)

'99% Invisible has 16 pages of episodes'

In [3]:
def author_episodes(max_pages):
    '''
    returns only episodes that contain the phrase "author of" in the credits. 
    '''
    df_allepisodes = get_description(max_pages)

    df_credits = df_allepisodes[df_allepisodes.description != 'NA']
    df_credits['description'] = df_credits['description'].astype(str)
    df_authors = df_credits[df_credits['description'].str.contains("author of")].reset_index(drop=True)

    return df_authors

In [5]:
df_authors = author_episodes(16)
df_authors

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link of each page. 200 is good: 200
^ scraping basic episode information. 20 episodes per page
Testing link o

HBox(children=(FloatProgress(value=0.0, max=466.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_credits['description'] = df_credits['description'].astype(str)


Unnamed: 0,date,episode_number,title,episode_link,episode_title,description
0,03.08.21,Episode 434,,https://99percentinvisible.org/episode/artisti...,Artistic License,['Reporter Daniel Ackerman spoke with Rick Jus...
1,02.16.21,Episode 431,,https://99percentinvisible.org/episode/12-head...,12 Heads from the Garden of Perfect Brightness,"['Producer Vivian Le spoke with Ai Weiwei, art..."
2,02.08.21,Episode 430,,https://99percentinvisible.org/episode/the-doo...,The Doom Boom,"['Host Roman Mars spoke with Bradley Garrett, ..."
3,11.03.20,Episode 420,,https://99percentinvisible.org/episode/the-los...,The Lost Cities of Geo,"['Producer Vivian Le spoke with David Bohnett,..."
4,09.22.20,Episode 414,,https://99percentinvisible.org/episode/the-add...,The Address Book,"['Host Roman Mars spoke with Deirdre Mask, aut..."
...,...,...,...,...,...,...
75,12.16.14,Episode 145,,https://99percentinvisible.org/episode/octotho...,Octothorpe,"['Producer Avery Trufelman spoke with ', <a hr..."
76,12.09.14,Episode 144,,https://99percentinvisible.org/episode/there-i...,There Is a Light That Never Goes Out,['This episode was adapted from a piece that '...
77,10.22.14,Episode 137,,https://99percentinvisible.org/episode/good-br...,Good Bread,['99% Invisible wonder boy\xa0Sam Greenspan sp...
78,07.29.14,Episode 125,,https://99percentinvisible.org/episode/duplite...,Duplitecture,['Producer Avery Trufelman spoke with Bianca B...


In [6]:
df_authors.to_csv('author_episodes_16pgs.csv', encoding='utf-8', index=False)