# **Google Podcast Downloader Pipeline (Notebook Version with the Code)**

## Scraping Podcasts

In [1]:
import requests
from bs4 import BeautifulSoup
import wget # to download files
import pandas as pd
import os
import utils
import yaml
import sqlite3

## Task 1: Collect Configuration

In [2]:
config = dict()
with open("config.yaml", "r") as f:
    config = yaml.load(f,Loader=yaml.FullLoader)
days_prior = config['days_prior']
download_path = config['path']

## Task 2: Collect URLs from subscribed file

In [3]:
URLs = []
with open('podcast_urls.txt') as input_file:
    lines = input_file.readlines()
    for line in lines:
        URLs.append(line.strip())

## Task 3a: Scrape podcast metadata, saving it to a dataframe


In [4]:
podcast_metadata_df = pd.DataFrame(columns =['podcast','episode','date','length'])


In [5]:
def load_podcast_metadata(df,days,url,path): 
    soup = BeautifulSoup(requests.get(url).text,'lxml')
    show_title = soup.find('div', {'class':'ZfMIwb'}).text
    try:
        os.mkdir(path+'/'+show_title.replace(" ","_"))
    except FileExistsError:
        pass
    for podcast in soup.find_all('a', {'role':'listitem'}):
        release_date = podcast.find('div',{'class':'OTz6ee'}).text
        if release_date in utils.get_time_range(days):
            if release_date.endswith('ago'):
                release_date = utils.string_to_delta(release_date)
            release_date = utils.to_datetime(release_date)
            name = podcast.find('div', {'class':'e3ZUqe'}).text        
            length = podcast.find('span', {'class':'gUJ0Wc'}).text            
            metadata = [show_title,name,release_date,length]
            df.loc[-1] = metadata
            df.index = df.index + 1  # shifting index
            df.sort_index(inplace=True) 

In [6]:
for url in URLs:
    load_podcast_metadata(podcast_metadata_df,days_prior,url,download_path)
podcast_metadata_df['date'] = pd.to_datetime(podcast_metadata_df['date'])
podcast_metadata_df = podcast_metadata_df.sort_values(by='date',ascending=False)            
podcast_metadata_df.head()

Unnamed: 0,podcast,episode,date,length
0,Lex Fridman Podcast,"#350 – Betül Kaçar: Origin of Life, Ancient DN...",2022-12-30,2 hr 48 min
2,History of the World podcast,Unscripted (48) ( Boxing Day ),2022-12-26,13 min
1,History of the World podcast,Unscripted (47) ( Christmas Day ),2022-12-25,16 min


## Task 4a: Store this metadata in an SQL database

In [9]:
database = "metadata.sqlite"
connection = sqlite3.connect(database)
podcast_metadata_df.to_sql('podcasts_metadata',connection,if_exists='append',index=False)
cursor = connection.cursor()

In [12]:
cursor.execute("SELECT * FROM podcasts_metadata ORDER BY date DESC LIMIT 5")
rows = cursor.fetchall()
for row in rows:
    print(row)

('Lex Fridman Podcast', '#350 – Betül Kaçar: Origin of Life, Ancient DNA, Panspermia, and Aliens', '2022-12-30 00:00:00', '2 hr 48 min')
('Lex Fridman Podcast', '#350 – Betül Kaçar: Origin of Life, Ancient DNA, Panspermia, and Aliens', '2022-12-30 00:00:00', '2 hr 48 min')
('History of the World podcast', 'Unscripted (48) ( Boxing Day )', '2022-12-26 00:00:00', '13 min')
('History of the World podcast', 'Unscripted (48) ( Boxing Day )', '2022-12-26 00:00:00', '13 min')
('History of the World podcast', 'Unscripted (47) ( Christmas Day )', '2022-12-25 00:00:00', '16 min')


In [13]:
connection.close()

## Task 3b: Scrape the podcast audio and save to local

In [11]:
def download_podcast_audio(days,url,path): 
    soup = BeautifulSoup(requests.get(url).text,'lxml')
    show_title = soup.find('div', {'class':'ZfMIwb'}).text
    for podcast in soup.find_all('a', {'role':'listitem'}):
        release_date = podcast.find('div',{'class':'OTz6ee'}).text
        if release_date in utils.get_time_range(days):
            name = podcast.find('div', {'class':'e3ZUqe'}).text
            print("\nDownloading: "+show_title+" - "+name) 
            url = podcast.find('div', {'jsname':'fvi9Ef'})['jsdata'].split(';')[1]
            filename = wget.download(url, out=path+'/'+show_title.replace(" ","_"))
            os.rename(filename,path+'/'+show_title.replace(" ","_")+'/'+name+'.mp3')

In [12]:
for url in URLs:
    download_podcast_audio(days_prior,url,download_path)


Downloading: History of the World podcast - Unscripted (48) ( Boxing Day )
100% [....................................................] 12485863 / 12485863
Downloading: History of the World podcast - Unscripted (47) ( Christmas Day )
100% [....................................................] 15731440 / 15731440
Downloading: Lex Fridman Podcast - #349 – Bhaskar Sunkara: The Case for Socialism
100% [..................................................] 168902536 / 168902536

## Task 5ab: end