<a href="https://colab.research.google.com/github/JasonAHeron/PodBlock/blob/main/podcast_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install feedparser pydub google-cloud-speech recordtype

In [26]:
import feedparser
import os
from recordtype import recordtype
import requests
import tensorflow as tf
from pydub import AudioSegment
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.cloud import speech

In [2]:
# Authenticate the Colab session, then (re)mount Google Drive at /content/drive.
auth.authenticate_user()
drive.mount('/content/drive', force_remount=True)

# Project name and the service-account key file, which lives on the mounted
# Drive -- so the mount above has to happen first.
# NOTE(review): presumably these variables are read by the Google client
# libraries constructed below -- confirm.
os.environ.update({
    'GCP_PROJECT': 'podblock',
    'GOOGLE_APPLICATION_CREDENTIALS': '/content/drive/My Drive/podblock/gc-creds.json',
})

# Module-level clients shared by the helper functions defined in later cells.
speech_client = speech.SpeechClient()
gcs_service = build('storage', 'v1')

Mounted at /content/drive


In [3]:
# Fetch and parse the podcast RSS feed; `daily_rss_entries` is the list of
# feed entries that the episode-building cell below iterates over.
daily_rss_url = "https://feeds.simplecast.com/54nAGcIl"
daily_rss_feed = feedparser.parse(daily_rss_url)
daily_rss_entries = daily_rss_feed.entries

In [28]:
# Mutable record describing one episode. `file` and `audio_segment` start out
# as empty strings and are filled in once the episode has been downloaded.
Episode = recordtype('Episode', 'title url file audio_segment')


def _enclosure_url(entry):
  """Return the href of the entry's audio enclosure link, or None if absent.

  A link qualifies when its `rel` is 'enclosure' or its `type` is an audio
  MIME type, so the result does not depend on the order of `entry['links']`.
  """
  for link in entry.get('links', []):
    is_enclosure = link.get('rel') == 'enclosure'
    is_audio = str(link.get('type', '')).startswith('audio/')
    if (is_enclosure or is_audio) and link.get('href'):
      return link['href']
  return None


episodes = []
for entry in daily_rss_entries:
  audio_url = _enclosure_url(entry)
  if audio_url is None:
    # Entries without downloadable audio cannot be processed; skip them
    # instead of failing on a missing second link.
    print(f"Skipping entry without audio enclosure: {entry.get('title', '<untitled>')}")
    continue
  episodes.append(Episode(entry['title'], audio_url, '', ''))

In [5]:
# Create the Drive folders that later cells write into. An already existing
# folder is not an error; it is only reported.
output_folders = (
    "/content/drive/My Drive/podblock/podcast_content",
    "/content/drive/My Drive/podblock/podcast_text",
    "/content/drive/My Drive/podblock/podcast_ads",
)
for folder_path in output_folders:
  try:
    os.mkdir(folder_path)
  except FileExistsError:
    print("Folder already found")

Folder already found
Folder already found
Folder already found


In [34]:
def uploadAudioFileToGcs(episode, bucket='podblock_audio_full'):
  """Export an episode's audio as FLAC and upload it to a Cloud Storage bucket.

  Args:
    episode: object with `title`, `file` (local path of the downloaded audio)
      and `audio_segment` (object exposing `export(path, format=...)`).
    bucket: name of the destination bucket. Defaults to the bucket this
      notebook has always uploaded to.

  Returns:
    The `gs://<bucket>/<name>` URI of the uploaded FLAC object.
  """
  print(f"Uploading flac encoded audio to GCS for: {episode.title}")
  # Replace the source extension, whatever its length, with .flac so the
  # FLAC file is written next to the downloaded audio.
  flac_path = f"{os.path.splitext(episode.file)[0]}.flac"
  upload_filename = os.path.basename(flac_path)
  episode.audio_segment.export(flac_path, format="flac")
  media = MediaFileUpload(flac_path, resumable=True)
  request = gcs_service.objects().insert(bucket=bucket,
                                         name=upload_filename,
                                         media_body=media)
  # Keep sending chunks until next_chunk() hands back a response object.
  response = None
  while response is None:
    _, response = request.next_chunk()
  return f"gs://{bucket}/{upload_filename}"

In [30]:
def recognizeSTT(gcs_uri):
  """Run long-running speech recognition on an audio object stored in GCS.

  Args:
    gcs_uri: URI of the audio object; it is passed to the API as FLAC,
      single channel, 44100 Hz, en-US.

  Returns:
    Whatever `operation.result()` returns for the recognition request. This
    call blocks until that result is available.
  """
  request_audio = speech.RecognitionAudio(uri=gcs_uri)
  recognition_settings = dict(
      encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
      sample_rate_hertz=44100,
      audio_channel_count=1,
      enable_word_time_offsets=True,
      enable_automatic_punctuation=True,
      model="video",
      use_enhanced=True,
      language_code="en-US",
  )
  request_config = speech.RecognitionConfig(**recognition_settings)

  # Start the recognition job, then wait for it to finish.
  pending = speech_client.long_running_recognize(config=request_config,
                                                 audio=request_audio)
  print(f"Recognizing speech in : {gcs_uri}")
  transcript = pending.result()
  print("Speech recognition completed")
  return transcript

In [31]:
def uploadRecognizedTextToDrive(title, stt_response,
                                output_dir="/content/drive/My Drive/podblock/podcast_text"):
  """Write every recognition result to `<output_dir>/<title>.txt`.

  Args:
    title: episode title, used as the file name. Any "/" is replaced by "_"
      so the title cannot be interpreted as a sub-directory.
    stt_response: object whose `results` attribute is iterable; each item is
      written on its own "Result: ..." line.
    output_dir: directory the transcript is written to. Defaults to the
      notebook's Drive transcript folder.
  """
  safe_title = title.replace('/', '_')
  transcript_path = os.path.join(output_dir, f"{safe_title}.txt")
  # Explicit UTF-8 so non-ASCII transcript text does not depend on the
  # platform's default encoding.
  with open(transcript_path, 'w', encoding='utf-8') as f:
    for result in stt_response.results:
      f.write("Result: {}\n".format(result))

In [37]:
# Durations in milliseconds, the unit used for the audio slice bounds below.
second = 1000
minute = 60 * second

def encodeAndUploadAudioSegments(episode):
  """Cut three clips out of an episode and export each as a WAV file on Drive.

  The clips are the first 30 seconds (saved as "ad-preroll"), the last
  30 seconds ("ad-postroll") and the span from 0:31 to 6:00 ("content").

  Args:
    episode: object with `title` and a sliceable `audio_segment` whose slices
      expose `export(path, format=..., bitrate=...)`.
  """
  print(f"Uploading wav encoded audio segments to drive for: {episode.title}")
  audio = episode.audio_segment
  clips = [
      (audio[:30 * second],
       f"/content/drive/My Drive/podblock/podcast_ads/ad-preroll-{episode.title}.wav"),
      (audio[-30 * second:],
       f"/content/drive/My Drive/podblock/podcast_ads/ad-postroll-{episode.title}.wav"),
      (audio[31 * second:6 * minute],
       f"/content/drive/My Drive/podblock/podcast_content/content-{episode.title}.wav"),
  ]
  for clip, destination in clips:
    clip.export(destination, format="wav", bitrate='16k')


In [35]:
# Results of this cell, read by the cells below. They stay None when no
# episode is selected.
stt_response = None
gcs_uri = None
selected_episode = None

# Offer the episodes one at a time and process only the first one answered
# with "y".
for episode in episodes:
  answer = input(f"Want to process {episode.title}?")
  if answer != "y":
    continue

  print(f"\n\nProcessing {episode.title}")
  # Download the mp3 into ./data and load it as single-channel audio.
  episode.file = tf.keras.utils.get_file(
      f'{episode.title}.mp3', episode.url, cache_dir='./', cache_subdir='data')
  episode.audio_segment = AudioSegment.from_mp3(episode.file).set_channels(1)
  selected_episode = episode

  encodeAndUploadAudioSegments(episode)
  gcs_uri = uploadAudioFileToGcs(episode)
  # Speech recognition is not run here; the next cell calls recognizeSTT.
  break

Want to process The Sunday Read: ‘The Untold Story of Sushi in America’?n
Want to process An Interview With Dr. Anthony Fauci?y
Processing An Interview With Dr. Anthony Fauci

Uploading wav encoded audio segments to drive for: An Interview With Dr. Anthony Fauci
Uploading flac encoded audio to GCS for: An Interview With Dr. Anthony Fauci
Upload ./data/An Interview With Dr. Anthony Fauci.flac complete


In [None]:
# Transcribe the uploaded audio. `gcs_uri` is assigned by the episode
# selection cell above and is still None if no episode was chosen there.
stt_response = recognizeSTT(gcs_uri)

Waiting for operation to complete...
Operation completed...


In [None]:
# Save the recognition results to Drive under the selected episode's title.
# Requires that an episode was selected and the recognition cell has run.
uploadRecognizedTextToDrive(selected_episode.title, stt_response)

In [36]:
# Last step: going by its name, this flushes pending writes to Drive and
# unmounts it -- NOTE(review): confirm against the Colab docs.
drive.flush_and_unmount()
