In [41]:
"""
99% Invisible Booklist
Credits NLP 
Author: Lindsey Viann Parkinson Last updated: March 16, 2021
Tokenize the credits scraped earlier and extract authors and title
"""

# Packages
import pandas as pd
from tqdm.notebook import tqdm
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk



In [42]:
credits_df = pd.read_csv('author_episodes_16pgs.csv')
desc_col = credits_df["description"]

## Creating titles column

In [43]:
def titles_from_credits(desc_col): 
    book_title = []
    regex_title = r'(?<=<em>)(.*?)(?=<\/em>)' #italics
    for credit in desc_col:  
        if "<em>" in credit:
            book_title.append(re.findall(regex_title,credit))
        else:
            book_title.append(re.findall(r'(?<=author of )(.*?)(?=\;)', credit)) #between "author of" and a ";"
    return book_title


def clean_titles(desc_col):
    regex_title_href = r'(?<=blank">)(.*?)(?=<\/a>)' 
    all_titles=[]
    for titles_list in titles_from_credits(desc_col):
        title2 =[]
        for item in titles_list:
            if "href" in item:
                string = re.findall(regex_title_href, item) #between pieces of hyperlink with title
                title2.append(string)
            else:
                title2.append(item)
        all_titles.append(title2)
    return all_titles

In [44]:
book_title = clean_titles(desc_col)

In [45]:
credits_df['book_title'] = book_title

remove_chars = ['[', ']', "'"]
credits_df['book_title'] = credits_df['book_title'].astype(str).str.translate({ord(elem): None for elem in remove_chars} )

credits_df

Unnamed: 0,date,episode_number,title,episode_link,episode_title,description,book_title
0,03.08.21,Episode 434,,https://99percentinvisible.org/episode/artisti...,Artistic License,['Reporter Daniel Ackerman spoke with Rick Jus...,"“Speaking of Idaho” blog, Taking the Wheel: Wo..."
1,02.16.21,Episode 431,,https://99percentinvisible.org/episode/12-head...,12 Heads from the Garden of Perfect Brightness,"['Producer Vivian Le spoke with Ai Weiwei, art...","Never Forget National Humiliation;\xa0, Chines..."
2,02.08.21,Episode 430,,https://99percentinvisible.org/episode/the-doo...,The Doom Boom,"['Host Roman Mars spoke with Bradley Garrett, ...",Bunker: Building for the End Times
3,11.03.20,Episode 420,,https://99percentinvisible.org/episode/the-los...,The Lost Cities of Geo,"['Producer Vivian Le spoke with David Bohnett,...",Fallen Glory: The Lives and Deaths of History’...
4,09.22.20,Episode 414,,https://99percentinvisible.org/episode/the-add...,The Address Book,"['Host Roman Mars spoke with Deirdre Mask, aut...",
...,...,...,...,...,...,...,...
75,12.16.14,Episode 145,,https://99percentinvisible.org/episode/octotho...,Octothorpe,"['Producer Avery Trufelman spoke with ', <a hr...",Shady Characters: The Secret Life of Punctuati...
76,12.09.14,Episode 144,,https://99percentinvisible.org/episode/there-i...,There Is a Light That Never Goes Out,['This episode was adapted from a piece that '...,
77,10.22.14,Episode 137,,https://99percentinvisible.org/episode/good-br...,Good Bread,['99% Invisible wonder boy\xa0Sam Greenspan sp...,White Bread: A Social History of the Store-Bou...
78,07.29.14,Episode 125,,https://99percentinvisible.org/episode/duplite...,Duplitecture,['Producer Avery Trufelman spoke with Bianca B...,


## Creating author column



In [46]:
podcast_staff = ['Roman Mars', 'Chris Berube', 'Emmett FitzGerald', 'Emmett Fitzgerald', 'Delaney Hall', 'Christopher Johnson', 'Sofia Klatzker Miller', 'Sofia Klatzer', 'Kurt Kohlstedt', 'Vivian Le', 'Abby Madan', 'Katie Mingle', 'Sean Real', 'Joe Rosenberg', 'Avery Trufelman']


In [47]:
# https://stackoverflow.com/questions/20290870/improving-the-extraction-of-human-names-with-nltk


def extract_names(desc_col):
    names = []
    for description in desc_col:          
        #short_desc = str(re.search(r'(^.*?author of)', description)) + ' foo.'
        for sent in nltk.sent_tokenize(description):
            if "author of" in sent:   
                short_sent = str(re.findall(r'(^.*?author of)', description))
                podcast_guests = []
                for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(short_sent))):
                    if hasattr(chunk, 'label'):
                        if chunk.label() == 'PERSON': 
                            name = ' '.join(c[0] for c in chunk.leaves())
                            if name not in podcast_staff: 
                                podcast_guests.append(name)
                names.append(podcast_guests[-1]) if len(podcast_guests) > 0 else names.append('NA')
                break
    return names


In [48]:
author = extract_names(desc_col)

In [49]:
credits_df['author'] = author
credits_df

Unnamed: 0,date,episode_number,title,episode_link,episode_title,description,book_title,author
0,03.08.21,Episode 434,,https://99percentinvisible.org/episode/artisti...,Artistic License,['Reporter Daniel Ackerman spoke with Rick Jus...,"“Speaking of Idaho” blog, Taking the Wheel: Wo...",Rick Just
1,02.16.21,Episode 431,,https://99percentinvisible.org/episode/12-head...,12 Heads from the Garden of Perfect Brightness,"['Producer Vivian Le spoke with Ai Weiwei, art...","Never Forget National Humiliation;\xa0, Chines...",Zheng Wang
2,02.08.21,Episode 430,,https://99percentinvisible.org/episode/the-doo...,The Doom Boom,"['Host Roman Mars spoke with Bradley Garrett, ...",Bunker: Building for the End Times,Bradley Garrett
3,11.03.20,Episode 420,,https://99percentinvisible.org/episode/the-los...,The Lost Cities of Geo,"['Producer Vivian Le spoke with David Bohnett,...",Fallen Glory: The Lives and Deaths of History’...,James Crawford
4,09.22.20,Episode 414,,https://99percentinvisible.org/episode/the-add...,The Address Book,"['Host Roman Mars spoke with Deirdre Mask, aut...",,Deirdre Mask
...,...,...,...,...,...,...,...,...
75,12.16.14,Episode 145,,https://99percentinvisible.org/episode/octotho...,Octothorpe,"['Producer Avery Trufelman spoke with ', <a hr...",Shady Characters: The Secret Life of Punctuati...,Keith Houston
76,12.09.14,Episode 144,,https://99percentinvisible.org/episode/there-i...,There Is a Light That Never Goes Out,['This episode was adapted from a piece that '...,,Mooallem
77,10.22.14,Episode 137,,https://99percentinvisible.org/episode/good-br...,Good Bread,['99% Invisible wonder boy\xa0Sam Greenspan sp...,White Bread: A Social History of the Store-Bou...,Aaron Bobrow-Strain
78,07.29.14,Episode 125,,https://99percentinvisible.org/episode/duplite...,Duplitecture,['Producer Avery Trufelman spoke with Bianca B...,,Bianca Bosker


In [55]:
ReadingList = pd.DataFrame([credits_df.book_title, credits_df.author, credits_df.episode_link]).transpose()

pd.set_option("max_rows", None)
pd.set_option('max_colwidth', None)
ReadingList

Unnamed: 0,book_title,author,episode_link
0,"“Speaking of Idaho” blog, Taking the Wheel: Women and the Coming of the Motor Age, Vintage Views along the West Michigan Pike, Mixed Speech: When speech is both private and governmental",Rick Just,https://99percentinvisible.org/episode/artistic-license/
1,"Never Forget National Humiliation;\xa0, Chinese Antiquities: An Introduction to the Art Market.",Zheng Wang,https://99percentinvisible.org/episode/12-heads-from-the-garden-of-perfect-brightness/
2,Bunker: Building for the End Times,Bradley Garrett,https://99percentinvisible.org/episode/the-doom-boom/
3,Fallen Glory: The Lives and Deaths of History’s Greatest Buildings,James Crawford,https://99percentinvisible.org/episode/the-lost-cities-of-geo/
4,,Deirdre Mask,https://99percentinvisible.org/episode/the-address-book/
5,"""Transgender History: The Roots of Today’s Revolution</span>, """,Susan Stryker,https://99percentinvisible.org/episode/where-do-we-go-from-here/
6,How The Post Office Created America: A History,Peach Springs Arizona,https://99percentinvisible.org/episode/the-revolutionary-post/
7,Policing the Open Road,Sarah Seo,https://99percentinvisible.org/episode/policing-the-open-road/
8,Heroic: Concrete Architecture and the New Boston,Adrian Forty,https://99percentinvisible.org/episode/the-smell-of-concrete-after-rain/
9,Wiped,Ron Blumer,https://99percentinvisible.org/episode/wipe-out/
