<a href="https://colab.research.google.com/github/JasonAHeron/PodBlock/blob/main/podcast_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install feedparser pydub google-cloud-speech



In [None]:
import feedparser
import os
from collections import namedtuple
import requests
import tensorflow as tf
from pydub import AudioSegment
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.cloud import speech

In [None]:
# Authenticate the Colab user and mount Google Drive at /content/drive.
# The mount must happen before the credentials path below can be read.
auth.authenticate_user()
drive.mount('/content/drive', force_remount=True)
# Point the Google Cloud client libraries at the project and at the service-account
# key file stored on the mounted Drive.
os.environ['GCP_PROJECT'] = 'podblock'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/drive/My Drive/podblock/gc-creds.json'

# Module-level clients shared by later cells: Speech-to-Text for transcription and
# the Cloud Storage JSON API (v1) for uploads.
speech_client = speech.SpeechClient()
gcs_service = build('storage', 'v1')

Mounted at /content/drive


In [None]:
# Fetch and parse the podcast RSS feed (hosted on Simplecast, per the URL).
# NOTE(review): the `daily_` prefix suggests a specific show; confirm which feed this ID is.
daily_rss_url = "https://feeds.simplecast.com/54nAGcIl"
daily_rss_feed = feedparser.parse(daily_rss_url)
# One entry per feed item; later cells read each entry's 'title' and 'links'.
daily_rss_entries = daily_rss_feed.entries

In [None]:
# Lightweight record for one feed item: its title and the URL of its audio file.
Episode = namedtuple('Episode', ['title', 'url'])


def _episodeAudioUrl(entry):
  """Return the audio URL for one parsed feed entry.

  Prefers the link explicitly marked as an enclosure, so the result does not
  depend on the order of the entry's links. Falls back to the second link
  (the position previously hard-coded) when no link is marked as an enclosure.

  Args:
    entry: A feed entry exposing a 'links' list of dict-like link objects.

  Returns:
    The 'href' of the chosen link.
  """
  links = entry['links']
  for link in links:
    if link.get('rel') == 'enclosure':
      return link['href']
  return links[1]['href']


# One Episode per feed entry, in feed order.
episodes = [
    Episode(entry['title'], _episodeAudioUrl(entry))
    for entry in daily_rss_entries
]

In [None]:
# Create each output folder on the mounted Drive. A folder that already exists
# is reported and skipped rather than treated as an error.
for subfolder in ("podcast_content", "podcast_text", "podcast_ads"):
  try:
    os.mkdir(f"/content/drive/My Drive/podblock/{subfolder}")
  except FileExistsError:
    print("Folder already found")

Folder already found
Folder already found
Folder already found


In [None]:
def uploadAudioFileToGcs(file_dir, bucket='podblock_audio_full'):
  """Convert a local MP3 to mono FLAC and upload it to Cloud Storage.

  The FLAC copy is written next to the source file with the same base name,
  then uploaded with a resumable upload through the module-level `gcs_service`.

  Args:
    file_dir: Path to the local MP3 file.
    bucket: Name of the destination Cloud Storage bucket.

  Returns:
    The `gs://` URI of the uploaded FLAC object.
  """
  print(f"Uploading {file_dir}")
  mono_audio = AudioSegment.from_mp3(file_dir).set_channels(1)
  # splitext handles any extension length; slicing off three characters would
  # corrupt the name for anything other than a 3-letter extension.
  flac_dir = f"{os.path.splitext(file_dir)[0]}.flac"
  upload_filename = os.path.basename(flac_dir)
  mono_audio.export(flac_dir, format="flac")
  media = MediaFileUpload(flac_dir, resumable=True)
  request = gcs_service.objects().insert(bucket=bucket,
                                         name=upload_filename,
                                         media_body=media)
  # next_chunk() yields a non-None response only once the final chunk is sent.
  response = None
  while response is None:
    _, response = request.next_chunk()
  print(f'Upload {flac_dir} complete')
  return f"gs://{bucket}/{upload_filename}"

In [None]:
def recognize(gcs_uri, sample_rate_hertz=44100, language_code="en-US"):
  """Transcribe a mono FLAC file stored in Cloud Storage.

  Submits a long-running recognition request through the module-level
  `speech_client` and blocks until the operation finishes.

  Args:
    gcs_uri: `gs://` URI of the FLAC audio to transcribe.
    sample_rate_hertz: Sample rate declared for the audio; it should match the
      uploaded file.
    language_code: Language tag of the spoken audio.

  Returns:
    The result of the recognition operation; callers iterate its `results`.
  """
  audio = speech.RecognitionAudio(uri=gcs_uri)
  config = speech.RecognitionConfig(
      encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
      sample_rate_hertz=sample_rate_hertz,
      audio_channel_count=1,
      enable_word_time_offsets=True,
      enable_automatic_punctuation=True,
      model="video",
      use_enhanced=True,
      language_code=language_code,
  )
  # Detects speech in the audio file
  operation = speech_client.long_running_recognize(config=config, audio=audio)
  print("Recognizing speech...")
  response = operation.result()
  print("Speech recognition completed...")
  return response

In [None]:
def uploadRecognizedTextToDrive(filename, response,
                                output_dir="/content/drive/My Drive/podblock/podcast_text"):
  """Write the recognition results for one episode to a text file.

  The output file is named after `filename` with its extension replaced by
  `.txt`. Only the final extension is removed, so titles that contain periods
  keep their full name.

  Args:
    filename: Name of the episode's audio file (e.g. "Title.mp3").
    response: Object whose `results` attribute is an iterable of results.
    output_dir: Folder the transcript is written to.
  """
  stem = os.path.splitext(os.path.basename(filename))[0]
  transcript_path = os.path.join(output_dir, f"{stem}.txt")
  # Explicit encoding: transcripts may contain non-ASCII characters.
  with open(transcript_path, 'w', encoding='utf-8') as f:
    for result in response.results:
      f.write("Result: {}\n".format(result))

In [None]:
# Time units for slicing audio: pydub AudioSegment slices are indexed in milliseconds.
second = 1000
minute = 60 * second

def writeAd(ad, episode_title, ad_set, pre_roll=True):
  """Export an ad clip to the ads folder unless the same audio was seen before.

  Duplicates are detected by the raw audio bytes of the clip with four seconds
  trimmed from each end. Those bytes are what is stored in `ad_set`, so the
  membership test and the stored values are the same type.

  Args:
    ad: Audio clip supporting millisecond slicing, `.raw_data` and `.export`.
    episode_title: Title used in the exported file name.
    ad_set: Set of raw-audio fingerprints seen so far; updated in place.
    pre_roll: True for a pre-roll ad (file prefix "adpreroll-"), otherwise
      the prefix is "ad-".

  Returns:
    True if the clip was new and was exported, False if it was a duplicate.
  """
  fingerprint = ad[4*second:-4*second].raw_data
  if fingerprint in ad_set:
    return False
  ad.export(f"/content/drive/My Drive/podblock/podcast_ads/ad{'preroll' if pre_roll else ''}-{episode_title}.wav", format="wav", bitrate='16k')
  ad_set.add(fingerprint)
  return True

def encodeAndUploadAudioSegments(filename, ad_set):
  """Split the current episode into ad and content clips and export them.

  The first and last 30 seconds are treated as pre-roll and post-roll ads and
  passed to `writeAd`; the span from 0:31 to 6:00 is exported as content.
  A one-line summary of what was written is printed.

  NOTE(review): `filename` is not used. The audio path and title are read from
  the module-level `episode_dir` and `episode` set by the processing loop.
  Confirm whether the parameter was meant to replace them.

  Args:
    filename: Currently unused (see note above).
    ad_set: Set of ads seen so far; passed to `writeAd`, which may add to it.
  """
  podcast = AudioSegment.from_mp3(episode_dir).set_channels(1)
  preroll_ad = podcast[:30 * second]
  postroll_ad = podcast[-30 * second:]
  content = podcast[31 * second:6 * minute]
  # Must start as an empty string; the += below would otherwise raise UnboundLocalError.
  output = ''
  if writeAd(preroll_ad, episode.title, ad_set):
    output += 'wrote preroll\t'
  if writeAd(postroll_ad, episode.title, ad_set, pre_roll=False):
    output += 'wrote postroll\t'
  content.export(f"/content/drive/My Drive/podblock/podcast_content/content-{episode.title}.wav", format="wav", bitrate='16k')
  # Sets have no `.length` attribute; len() gives the number of ads captured.
  output += f'ads captured {len(ad_set)}'
  print(output)

In [None]:
# State shared with later cells: the set of ads seen so far, plus the file name,
# Cloud Storage URI and recognition response of the last episode processed.
ad_set = set()
response = None
filename = None
gcs_uri = None
# Only the first entry of `episodes` is processed (note the [:1] slice).
for episode in episodes[:1]:
  print(f"Processing {episode.title}\n")
  # Download the MP3 to ./data/<title>.mp3 and get back its local path.
  episode_dir = tf.keras.utils.get_file(f'{episode.title}.mp3', episode.url, cache_dir='./', cache_subdir='data')
  # Base name of the downloaded file; a later cell uses it to name the transcript.
  filename = episode_dir.split('/')[-1]
  # Convert to FLAC, upload to Cloud Storage, then transcribe from the returned URI.
  gcs_uri = uploadAudioFileToGcs(episode_dir)
  response = recognize(gcs_uri)

Processing ‘How Did We Let People Die This Way?’

Downloading data from https://dts.podtrac.com/redirect.mp3/chrt.fm/track/8DB4DB/pdst.fm/e/nyt.simplecastaudio.com/03d8b493-87fc-4bd1-931f-8a8e9b945d8a/episodes/8ba12bd0-d8f5-4aa3-9f43-873ee0ccaf46/audio/128/default.mp3?aid=rss_feed&awCollectionId=03d8b493-87fc-4bd1-931f-8a8e9b945d8a&awEpisodeId=8ba12bd0-d8f5-4aa3-9f43-873ee0ccaf46&feed=54nAGcIl
Uploading ./data/‘How Did We Let People Die This Way?’.mp3
Upload ./data/‘How Did We Let People Die This Way?’.flac complete
Waiting for operation to complete...
Operation completed...


In [None]:
# NOTE(review): the processing loop already stores recognize(gcs_uri) in `response`;
# this submits the same recognition request again for the last gcs_uri. Confirm it is still needed.
response = recognize(gcs_uri)

Waiting for operation to complete...
Operation completed...


In [None]:
# Save the transcript of the last processed episode to the podcast_text folder on Drive.
uploadRecognizedTextToDrive(filename, response)

In [None]:
# Flush pending writes to Google Drive and unmount it; run this last.
drive.flush_and_unmount()
