<a href="https://colab.research.google.com/github/JasonAHeron/PodBlock/blob/main/podcast_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install feedparser pydub google-cloud-speech recordtype

In [1]:
import feedparser
import os
from recordtype import recordtype
import requests
import tensorflow as tf
from pydub import AudioSegment
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google.cloud import speech
import random
from difflib import SequenceMatcher
from tqdm import tqdm

In [None]:
# Authenticate the Colab user first; the Drive mount and API clients below
# are created afterwards.
auth.authenticate_user()
# force_remount=True remounts Drive even if it is already mounted.
drive.mount('/content/drive', force_remount=True)
os.environ['GCP_PROJECT'] = 'podblock'
# Service-account key stored on the mounted Drive. Presumably picked up by
# the clients created below via Application Default Credentials -- confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/drive/My Drive/podblock/gc-creds.json'
# Module-level clients shared by the speech-to-text and GCS upload helpers.
speech_client = speech.SpeechClient()
gcs_service = build('storage', 'v1')

In [None]:
# Duration units used when slicing audio elsewhere in this file
# (e.g. `30 * second`, `5 * minute`); one second is 1000 slice units.
second = 1000
minute = 60 * second
daily_rss_url = "https://feeds.simplecast.com/54nAGcIl"
daily_rss_feed = feedparser.parse(daily_rss_url)
daily_rss_entries = daily_rss_feed.entries
# Mutable record: `file` and `audio_segment` start as '' and are filled in
# later, once the episode has been downloaded and decoded.
Episode = recordtype('Episode', 'title url file audio_segment')


def _episodeAudioUrl(entry):
  """Return the audio URL of a feed entry, or None when it has none.

  Prefers the link marked rel="enclosure". Falls back to the second link
  (the position that used to be hard-coded) when no link is marked.
  """
  links = entry.get('links', [])
  for link in links:
    if link.get('rel') == 'enclosure' and link.get('href'):
      return link['href']
  if len(links) > 1:
    return links[1].get('href')
  return None


episodes = []
for entry in daily_rss_entries:
  audio_url = _episodeAudioUrl(entry)
  if audio_url is None:
    # An entry without audio cannot be processed; skip it instead of crashing.
    print(f"Skipping entry without an audio link: {entry.get('title')}")
    continue
  episodes.append(Episode(entry['title'], audio_url, '', ''))
print(f"Ready to process {len(episodes)} episodes")

In [None]:
# Folder layout on the mounted Google Drive.
# Root folder for all exported audio clips.
podblock_audio_directory = "/content/drive/My Drive/podblock/audio"
# Training split, with separate folders for content clips and ad clips.
training_file_directory = f"{podblock_audio_directory}/train"
training_content_directory = f"{training_file_directory}/content"
training_ad_directory = f"{training_file_directory}/ad"
# Test split, mirroring the training layout.
test_file_directory = f"{podblock_audio_directory}/test"
test_content_directory = f"{test_file_directory}/content"
test_ad_directory = f"{test_file_directory}/ad"
# Transcripts of ads already seen, used to detect repeated ads.
ad_text_directory = "/content/drive/My Drive/podblock/ad_text_for_deduping"


def makeDir(dir):
  """Create directory `dir`, including any missing parent directories.

  Prints a notice instead of raising when the path already exists.

  Args:
    dir: Path of the directory to create.
  """
  try:
    # makedirs (unlike mkdir) also creates missing parents, so a Drive that
    # lacks an intermediate folder no longer fails with FileNotFoundError.
    os.makedirs(dir)
  except FileExistsError:
    print("Folder already found")

# Create the Drive folder layout; each audio folder is listed after its
# parent folder.
for required_directory in (
    podblock_audio_directory,
    training_file_directory,
    test_file_directory,
    training_content_directory,
    test_content_directory,
    test_ad_directory,
    ad_text_directory,
    training_ad_directory,
):
  makeDir(required_directory)

In [7]:
def uploadAudioSegmentToGcs(episode, start, end, bucket='podblock_audio_full'):
  """Export a slice of an episode as FLAC and upload it to Cloud Storage.

  Args:
    episode: Episode whose `file` is the local audio path and whose
      `audio_segment` holds the decoded audio.
    start: Start offset of the slice taken from `audio_segment`.
    end: End offset of the slice, in the same units as `start`.
    bucket: Name of the destination bucket. Defaults to the bucket that was
      previously hard-coded.

  Returns:
    The `gs://` URI of the uploaded object.
  """
  print(f"Uploading flac encoded audio to GCS for: {episode.title}")
  # Swap the source extension for .flac without assuming its length.
  flac_path = f"{os.path.splitext(episode.file)[0]}.flac"
  upload_filename = os.path.basename(flac_path)
  episode.audio_segment[start:end].export(flac_path, format="flac")
  media = MediaFileUpload(flac_path, resumable=True)
  request = gcs_service.objects().insert(bucket=bucket,
                                         name=upload_filename,
                                         media_body=media)
  response = None
  # Keep sending chunks of the resumable upload until a response arrives.
  while response is None:
    _, response = request.next_chunk()
  return f"gs://{bucket}/{upload_filename}"

In [8]:
def longRecognizeSTT(gcs_uri):
  """Run long-running speech recognition on FLAC audio referenced by URI.

  Args:
    gcs_uri: URI of the audio, passed as the RecognitionAudio `uri`.

  Returns:
    The result of the finished recognition operation.
  """
  # Word time offsets and automatic punctuation are not enabled here.
  recognition_config = speech.RecognitionConfig(
      language_code="en-US",
      model="video",
      use_enhanced=True,
      audio_channel_count=1,
      encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
  )
  recognition_audio = speech.RecognitionAudio(uri=gcs_uri)
  pending = speech_client.long_running_recognize(
      config=recognition_config, audio=recognition_audio)
  print(f"Recognizing speech in : {gcs_uri}")
  # Waits for the operation's result before reporting completion.
  stt_result = pending.result()
  print("Speech recognition completed")
  return stt_result

def shortRecognizeSTT(file):
  """Transcribe a local FLAC file and return the transcript as one string.

  Args:
    file: Path of the FLAC file to read and send inline for recognition.

  Returns:
    The first alternative's transcript of every result, concatenated with
    no separator.
  """
  print(f"Short recognize for: {file}")
  # The file only needs to stay open while its bytes are read.
  with open(file, "rb") as flac_handle:
    flac_bytes = flac_handle.read()
  recognition_audio = speech.RecognitionAudio(content=flac_bytes)
  recognition_config = speech.RecognitionConfig(
      encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
      audio_channel_count=1,
      model="video",
      use_enhanced=True,
      language_code="en-US",
  )
  pending = speech_client.long_running_recognize(
      config=recognition_config, audio=recognition_audio)
  # Give up if the operation has not finished within 90 seconds.
  stt_result = pending.result(timeout=90)
  return ''.join(
      recognized.alternatives[0].transcript
      for recognized in stt_result.results
  )

In [9]:
def uploadRecognizedTextToDrive(title, stt_response,
                                directory="/content/drive/My Drive/podblock/podcast_text"):
  """Write every recognition result to `<directory>/<title>.txt`.

  Args:
    title: Base name (without extension) of the text file to write.
    stt_response: Object whose `results` attribute is iterated; each item
      is written on its own line prefixed with "Result: ".
    directory: Folder that receives the file. Defaults to the Drive folder
      that was previously hard-coded.
  """
  # This notebook's folder-setup calls do not create this folder, so make
  # sure it exists; otherwise open() raises FileNotFoundError.
  os.makedirs(directory, exist_ok=True)
  with open(f"{directory}/{title}.txt", 'w', encoding='utf-8') as f:
    f.writelines(f"Result: {result}\n" for result in stt_response.results)

def uploadAdDedupeTextToDrive(text, filename):
  """Save `text` as `<filename>.txt` inside the ad-text dedupe folder.

  Args:
    text: Ad transcript to store.
    filename: Base name (without extension) of the file to write.
  """
  destination = f"{ad_text_directory}/{filename}.txt"
  with open(destination, 'w') as transcript_file:
    transcript_file.write(text)

def loadAdDedupTextFromDrive():
  """Read every file under the ad-text dedupe folder.

  Returns:
    A set holding the full contents of each file found (searched
    recursively).
  """
  transcript_paths = (
      os.path.join(folder, name)
      for folder, _subfolders, names in os.walk(ad_text_directory)
      for name in names
  )
  transcripts = set()
  for path in transcript_paths:
    with open(path, 'r') as transcript_file:
      transcripts.add(transcript_file.read())
  return transcripts

In [15]:
def audioAlreadyProcessed(episode):
  """Return True if the episode's content clip was already exported.

  Checks for `<title>.wav` in the training content folder first, then in
  the test content folder.
  """
  wav_name = f"{episode.title}.wav"
  content_folders = (training_content_directory, test_content_directory)
  return any(
      os.path.exists(f"{folder}/{wav_name}") for folder in content_folders
  )

def randomUpload(audio_segment, title, isAd=True):
  """Export a clip as WAV into the training or test split, chosen at random.

  Args:
    audio_segment: Clip to export.
    title: Base name (without extension) of the WAV file.
    isAd: When true the clip goes to the ad folders, otherwise to the
      content folders.
  """
  # 1 selects the training folder, 0 the test folder.
  use_training_split = random.randint(0, 1)
  if isAd:
    train_folder, test_folder = training_ad_directory, test_ad_directory
  else:
    train_folder, test_folder = training_content_directory, test_content_directory
  destination = train_folder if use_training_split else test_folder
  audio_segment.export(f"{destination}/{title}.wav", format="wav", bitrate='16k')

def isKnownAd(ad_text, known_ad_text, threshold=.6):
  """Report whether `ad_text` closely matches an already-known ad transcript.

  Args:
    ad_text: Transcript to check.
    known_ad_text: Iterable of known ad transcripts.
    threshold: Similarity ratio that must be exceeded for a match. Defaults
      to the previously hard-coded 0.6.

  Returns:
    True as soon as one known transcript's similarity ratio exceeds
    `threshold` (the match is printed); False when none does.
  """
  for known_ad in known_ad_text:
    # Argument order is kept as-is: ratio() may depend on it.
    ratio = SequenceMatcher(None, ad_text, known_ad).ratio()
    if ratio > threshold:
      print(f"Found known ad\n{known_ad}\n{ad_text}\nRATIO: {ratio}")
      return True
  return False


def encodeAndUploadAudioSegments(episode, known_ad_text):
  """Export an episode's pre-roll ad and a content clip to Drive.

  The first 30 seconds are transcribed. If the transcript does not match a
  known ad, the clip is exported as an ad sample and its transcript is
  recorded. The span from 31 seconds to 5 minutes is always exported as a
  content sample.

  Args:
    episode: Episode with `title`, `file` (local audio path) and
      `audio_segment` (decoded audio) populated.
    known_ad_text: Set of known ad transcripts; updated in place when a new
      ad is found.

  Returns:
    The same `known_ad_text` set.
  """
  print("Uploading wav encoded audio segments to drive")
  preroll_ad = episode.audio_segment[:30 * second]
  preroll_file = f"{episode.file[:-4]}-preroll.flac"
  try:
    preroll_ad.export(preroll_file, format="flac")
    preroll_text = shortRecognizeSTT(preroll_file)
  finally:
    # The FLAC is only needed for transcription; remove it even when the
    # export or the recognition fails so temp files do not pile up.
    if os.path.exists(preroll_file):
      os.remove(preroll_file)

  if not isKnownAd(preroll_text, known_ad_text):
    print(f"Found new unknown ad: {preroll_text}")
    randomUpload(preroll_ad, episode.title)
    uploadAdDedupeTextToDrive(preroll_text, f"{episode.title}-preroll")
    known_ad_text.add(preroll_text)

  # Post-roll handling is disabled; kept for reference.
  # NOTE(review): as written it would export under the same title as the
  # pre-roll clip -- confirm before re-enabling.
  # postroll_ad = episode.audio_segment[-30 * second:]
  # postroll_file = f"{episode.file[:-4]}-postroll.flac"
  # postroll_ad.export(postroll_file, format = "flac")
  # postroll_text = shortRecognizeSTT(postroll_file)
  # if(not isKnownAd(postroll_text, known_ad_text)):
  #   randomUpload(postroll_ad, episode.title)
  #   uploadAdDedupeTextToDrive(postroll_text, f"{episode.title}-postroll")
  #   known_ad_text.add(postroll_text)
  # os.remove(postroll_file)

  # Start one second after the pre-roll window ends.
  content = episode.audio_segment[31 * second:5 * minute]
  randomUpload(content, episode.title, False)

  return known_ad_text

In [11]:
def uploadTextContent(content_gcs_uri):
  """Placeholder for the transcript upload step; currently does nothing.

  Args:
    content_gcs_uri: URI of the uploaded audio (unused for now).
  """
  # Disabled steps, kept for reference:
  #response = longRecognizeSTT(content_gcs_uri)
  #uploadRecognizedTextToDrive(selected_episode.title, stt_response)
  return None

In [None]:
# Seed the dedupe set with the ad transcripts saved by earlier runs.
known_ad_texts = loadAdDedupTextFromDrive()
print(f"Loaded {len(known_ad_texts)} known ads")
for episode in tqdm(episodes):
  # Episodes whose title starts with this prefix are not processed.
  if episode.title.startswith("The Sunday Read:"):
    continue
  ## Audio Pipeline
  if audioAlreadyProcessed(episode):
    print(f"Already processed audio for: {episode.title}")
  else:
    print(f"\nProcessing audio pipeline for: {episode.title}")
    # Downloads the mp3 into ./data.
    episode.file = tf.keras.utils.get_file(f'{episode.title}.mp3', episode.url, cache_dir='./', cache_subdir='data')
    # Downmix to mono; the speech configs in this file use one audio channel.
    episode.audio_segment = AudioSegment.from_mp3(episode.file).set_channels(1)
    known_ad_texts = encodeAndUploadAudioSegments(episode, known_ad_texts)
  ## Semantic Pipeline
  #content_gcs_uri = uploadAudioSegmentToGcs(episode, 31 * second, 5 * minute)
  #uploadTextContent(content_gcs_uri)
  # Drop the decoded audio once the episode is done. `episodes` keeps every
  # Episode alive, so holding each segment would grow memory without bound.
  # '' is the same placeholder the record is created with.
  episode.audio_segment = ''

In [17]:
# Flush pending writes to Google Drive and unmount it once processing is done.
drive.flush_and_unmount()
