# Part 1: Podcast Transcription and Information Extraction

## Step 1 - Retrieve the audio file

In [None]:
%pip install feedparser

In [1]:
import feedparser
podcast_feed_url = "https://feeds.acast.com/public/shows/d556eb54-6160-4c85-95f4-47d9f5216c49"
podcast_feed = feedparser.parse(podcast_feed_url)

In [2]:
for item in podcast_feed.entries[0].links:
  if (item['type'] == 'audio/mpeg'):
    episode_url = item.href

## Step 2 - Transcribe the audio file

In [None]:
%pip install git+https://github.com/openai/whisper.git  -q

In [3]:
import pathlib
import whisper

In [4]:
# Perform download only once and save to Network storage
model_path = pathlib.Path.cwd()
if model_path.joinpath("medium.pt").exists():
  print ("Model has been downloaded, no re-download necessary")
else:
  print ("Starting download of Whisper Model")
  whisper._download(whisper._MODELS["medium"], str(model_path), False)

Model has been downloaded, no re-download necessary


In [5]:
# Load model from saved location
model = whisper.load_model('medium', device='cpu', download_root=model_path)

In [6]:
%%time
result = model.transcribe("podcast_episode.mp3")



CPU times: total: 4min 29s
Wall time: 10min 13s


In [7]:
# Check the transcription happened correctly by peeking into the first 1000 characters
podcast_transcript = result['text']
result['text'][:1000]

" I am Charlotte Casaragui and in partnership with the House of Chanel, I present to you the Les Rencontres podcast. As part of the Rendez-vous littéraire at Rue Cambon, this podcast spotlights the birth of a female writer. You can listen to the various episodes and their authors on your preferred streaming platforms. Hello and welcome to The Intelligence from The Economist. I'm Jason Palmer. And I'm Auret Ogunbiyi. Every weekday we provide a fresh perspective on the events shaping your world. Around now, lots of kids are heading back to school. But in America, two years after a post-pandemic grand reopening, terrifying proportion of students just aren't in class. And if you've ever tried to get a rickshaw or a tuk-tuk in Bangalore, you know just how stressful it can be. Now there's a new app trying to change that. And that's good news not just for the customers, but for the drivers too. First up though. For decades, America and the Soviet Union were locked in a tense and often terrify

In [8]:
podcast_transcript

" I am Charlotte Casaragui and in partnership with the House of Chanel, I present to you the Les Rencontres podcast. As part of the Rendez-vous littéraire at Rue Cambon, this podcast spotlights the birth of a female writer. You can listen to the various episodes and their authors on your preferred streaming platforms. Hello and welcome to The Intelligence from The Economist. I'm Jason Palmer. And I'm Auret Ogunbiyi. Every weekday we provide a fresh perspective on the events shaping your world. Around now, lots of kids are heading back to school. But in America, two years after a post-pandemic grand reopening, terrifying proportion of students just aren't in class. And if you've ever tried to get a rickshaw or a tuk-tuk in Bangalore, you know just how stressful it can be. Now there's a new app trying to change that. And that's good news not just for the customers, but for the drivers too. First up though. For decades, America and the Soviet Union were locked in a tense and often terrify

## Step 3 - Create a summary of the podcast

In [None]:
%pip install openai
%pip install tiktoken

In [9]:
import openai
from getpass import getpass

openai.api_key = getpass('Enter the OpenAI API Key in the cell  ')

In [None]:
# we can confirm that the API key works by listing all the OpenAI models
models = openai.Model.list()
for model in models["data"]:
  print (model["root"])

In [11]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print ("Number of tokens in input prompt ", len(enc.encode(podcast_transcript)))

Number of tokens in input prompt  4103


In [12]:
instructPrompt = """
Please provide a summary of the following podcast, the transcript for which is provided below.

Here is how I would like you to perform this task:
- Identify the main speakers or participants in the podcast, and classify them as hosts or guests. Generally the podcast will be narrated by one or more hosts. You can generally identify a guest because they will be introduced by the host, and will then be interviewed by the host. For each participant, identify what organization they belong to (if any) and what their title is. Not every episode will have a guest.
- Break the podcast into sections, based on topics or themes.
- For each topic, summarize the topic with a concise summary.

When printing your summary, please organize it as follows:
- At the beginning of your response, for each podcast guest (not including the host/s), print the following information in the following format: GUEST NAME: [name]\nGUEST ORGANIZATION: [org]\nGUEST TITLE: [title]
- If there is no guest, then set [name] [org] and [title] equal to "none"
- After printing the guest information, print the summary of the episode, breaking it into the individual topics if needed.
- Finally, print any interesting takeaways from the episode. These should be items that are the most interesting or surprising moments in the entire episode. Please limit these to a maximum of 3 bullet points.

Here is the podcast transcript:


"""

request = instructPrompt + podcast_transcript

In [13]:
chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )

In [14]:
podcastSummary = chatOutput.choices[0].message.content
podcastSummary

"none\nSummary:\n- The podcast discusses the state of nuclear arms control and the potential for a new arms race. The breakdown of arms control treaties and the development of new technologies like hypersonic missiles and artificial intelligence have created a more unpredictable and unstable environment.\n- The guest, Anton LaGuardia, The Economist's diplomatic editor, explains that several factors contribute to this new arms race, including the war in Ukraine, the rise of China's nuclear stockpile, the breakdown of arms control agreements between the US and Russia, and advancements in technology.\n- It is suggested that countries like the US, Russia, and China need to start talking and working towards a new arms control agreement to prevent further escalation. The importance of maintaining current limits and avoiding panic is emphasized.\n- Another topic discussed is the decline in attendance and enrollment in American schools following the pandemic. Data shows that a significant numb

## Step 4 - Using functions to extract additional information to provide additional context on the episode

In [15]:
request = podcast_transcript[:5000]
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print ("Number of tokens in input prompt ", len(enc.encode(request)))

Number of tokens in input prompt  1013


In [16]:
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request}],
    functions=[
    {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their name to search on Wikipedia",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "The name of the guest who is speaking in the podcast",
                },
                "unit": {"type": "string"},
            },
            "required": ["guest_name"],
        },
    }
    ],
    function_call={"name": "get_podcast_guest_information"}
    )

In [None]:
completion

In [18]:
import json

podcast_guest = ""
response_message = completion["choices"][0]["message"]
if response_message.get("function_call"):
  function_name = response_message["function_call"]["name"]
  function_args = json.loads(response_message["function_call"]["arguments"])
  podcast_guest=function_args.get("guest_name")

print ("Podcast Guest is ", podcast_guest)

Podcast Guest is  Anton LaGuardia


In [None]:
%pip install wikipedia

In [None]:
import wikipedia
input = wikipedia.page(podcast_guest, auto_suggest=False)

In [None]:
podcast_guest_info = input.summary
print (podcast_guest_info)

### adjust prompt to extract full name of podcast guest

In [21]:
import re

text = podcastSummary
# Regular expression to extract guest information
pattern = r"GUEST NAME: (?P<name>.+?)\nGUEST ORGANIZATION: (?P<org>.+?)\nGUEST TITLE: (?P<title>.+?)\n\n"

# Extracting the details using regex
matches = list(re.finditer(pattern, text))

guest_details = [{"name": match.group("name"), "organization": match.group("org"), "title": match.group("title")} for match in matches]

print(guest_details)

[]


In [22]:
request = str(guest_details)
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print ("Number of tokens in input prompt ", len(enc.encode(request)))

Number of tokens in input prompt  1


In [23]:
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request}],
    functions=[
    {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google.",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "The full name of the guest who is speaking in the podcast",
                },
                "guest_organization": {
                    "type": "string",
                    "description": "The full name of the organization that the podcast guest belongs to or runs",
                },
                "guest_title": {
                    "type": "string",
                    "description": "The title, designation or role of the podcast guest in their organization",
                },
            },
            "required": ["guest_name"],
        },
    }
],
function_call={"name": "get_podcast_guest_information"}
)

In [24]:
import json

podcast_guest = ""
podcast_guest_org = ""
podcast_guest_title = ""
response_message = completion["choices"][0]["message"]
if response_message.get("function_call"):
  function_name = response_message["function_call"]["name"]
  function_args = json.loads(response_message["function_call"]["arguments"])
  podcast_guest=function_args.get("guest_name")
  podcast_guest_org=function_args.get("guest_organization")
  podcast_guest_title=function_args.get("guest_title")

In [25]:
print (podcast_guest)
print (podcast_guest_org)
print (podcast_guest_title)

John Doe
ABC Company
CEO


In [26]:
if podcast_guest_org is None:
  podcast_guest_org = ""
if podcast_guest_title is None:
  podcast_guest_title = ""

In [27]:
input = wikipedia.page(podcast_guest + " " + podcast_guest_org + " " + podcast_guest_title, auto_suggest=True)

In [28]:
input.summary

"Jeffrey Edward Epstein ( EP-steen; January 20, 1953 – August 10, 2019) was an American sex offender and financier. Epstein, who was born and raised in New York City, began his professional life by teaching at the Dalton School despite lacking a college degree. After his dismissal from the school, he entered the banking and finance sector, working at Bear Stearns in various roles before starting his own firm. Epstein developed an elite social circle and procured many women and children whom he and his associates sexually abused.In 2005, police in Palm Beach, Florida, began investigating Epstein after a parent reported that he had sexually abused her 14-year-old daughter. Epstein pleaded guilty and was convicted in 2008 by a Florida state court of procuring a child for prostitution and of soliciting a prostitute. He served almost thirteen months in custody, but with extensive work release. He was convicted of only these two crimes as part of a controversial plea deal; federal officials 

### Try Google Search using langchain to find guest info

In [None]:
%pip install langchain
%pip install google-search-results
%pip install openai

In [29]:
# Get description of guest via LangChain SERP API tool
from langchain.llms import OpenAI
from langchain.agents import load_tools
from langchain.agents import initialize_agent
import os

os.environ['OPENAI_API_KEY'] = "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"
os.environ["SERPAPI_API_KEY"] = "f6c21e07e6e07ef35236119333a7317262bb6e0f5a0e751a3cd301170353533c"

llm = OpenAI(temperature=0.3)
tools = load_tools(["serpapi"], llm=llm)
agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)

# Using the agent to get a description of the podcast guest
query = f"Describe this person: {podcast_guest} {podcast_guest_org} {podcast_guest_title}"
result = agent.run(query)
podcast_guest_summary = result
print(result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out more information about John Doe
Action: Search
Action Input: "John Doe ABC Company CEO"[0m
Observation: [36;1m[1;3mpresident at abc company · View mutual connections with doe · Welcome back · Experience · Explore collaborative articles · Others named doe john · View doe's full ...[0m
Thought:[32;1m[1;3m I now have enough information to answer the question
Final Answer: John Doe is the CEO of ABC Company. He has experience in the industry and is connected to many people in the business.[0m

[1m> Finished chain.[0m
John Doe is the CEO of ABC Company. He has experience in the industry and is connected to many people in the business.


In [30]:
from serpapi import GoogleSearch

image_query = f"{podcast_guest} {podcast_guest_org} {podcast_guest_title}"
search = GoogleSearch({
    "engine": "google_images",
    "q": image_query,
    "api_key": os.environ["SERPAPI_API_KEY"]
})
image_search_result = search.get_dict()
print(image_search_result)
image_url = image_search_result['images_results'][0]['original']
print(image_url)

{'search_metadata': {'id': '64ff432e654a8ccc86af2147', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/72cdd2f777018f17/64ff432e654a8ccc86af2147.json', 'created_at': '2023-09-11 16:41:18 UTC', 'processed_at': '2023-09-11 16:41:18 UTC', 'google_images_url': 'https://www.google.com/search?q=John+Doe+ABC+Company+CEO&oq=John+Doe+ABC+Company+CEO&hl=en&gl=us&tbm=isch', 'raw_html_file': 'https://serpapi.com/searches/72cdd2f777018f17/64ff432e654a8ccc86af2147.html', 'total_time_taken': 2.15}, 'search_parameters': {'engine': 'google_images', 'q': 'John Doe ABC Company CEO', 'google_domain': 'google.com', 'hl': 'en', 'gl': 'us', 'device': 'desktop'}, 'search_information': {'image_results_state': 'Results for exact spelling', 'menu_items': [{'position': 1, 'title': 'All', 'link': 'https://www.google.com/search?q=John+Doe+ABC+Company+CEO&source=lmns&gl=us&hl=en&sa=X&ved=2ahUKEwit95f3_6KBAxU6IGIAHW7FBocQ0pQJKAB6BAgBEAI', 'serpapi_link': 'https://serpapi.com/search.json?device=des

In [31]:
podcast_guest = {
    "name": podcast_guest,
    "organization": podcast_guest_org,
    "title": podcast_guest_title,
    "summary": podcast_guest_summary,
    "profile_picture": image_url
}

print(podcast_guest)

{'name': 'John Doe', 'organization': 'ABC Company', 'title': 'CEO', 'summary': 'John Doe is the CEO of ABC Company. He has experience in the industry and is connected to many people in the business.', 'profile_picture': 'https://media.licdn.com/dms/image/C5103AQFl656k2-DwOg/profile-displayphoto-shrink_100_100/0/1517034956958?e=1697068800&v=beta&t=C29n-fG57W0PWit1iiZ7Gmh8qy5AaQ_q6RYvvYtRLHM'}


## Step 5 - Extract the Highlights of the podcast

In [32]:
instructPrompt = """
You are a podcast editor and producer. You are provided with the transcript of a podcast episode and have to identify the 5 most significant moments in the podcast as highlights
- Each highlight needs to be a statement by one of the podcast guests
- Each highlight has to be impactful and an important takeaway from this podcast episode
- Each highlight must be concise and make listeners want to hear more about why the podcast guest said that
- The highlights that you pick must be spread out throughout the episode

Provide only the highlights and nothing else. Provide the full sentence of the highlight and format it as follows -

- Highlight 1 of the podcast
- Highlight 2 of the podcast
- Highlight 3 of the podcast
"""

request = instructPrompt + podcast_transcript

In [33]:
chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )

In [34]:
chatOutput.choices[0].message.content

"Highlights from the podcast:\n- The world is on the brink of a new nuclear arms race and one that will be harder to stop because it's more complex.\n- Nuclear weapons are inherently dangerous, but there are several reasons for which this is a different moment.\n- The war in Ukraine and the rise of China are factors contributing to the breakdown of arms control.\n- China is building up its nuclear stockpile quickly, and there is no sign of a follow-on agreement to New START.\n- Three-way deterrence between the United States, Russia, and China is complicated and potentially unstable.\n- Schools in the United States are struggling to bring students back to class after the pandemic, with chronic absenteeism on the rise.\n- Some families are choosing alternative forms of education, such as homeschooling, leading to enrollment declines.\n- An app called Namayathri is improving the experience for rickshaw drivers and riders in Bangalore by directly connecting them without any commission fees

# Part 2: On-demand information extraction

In [None]:
%pip install feedparser
%pip install git+https://github.com/openai/whisper.git  -q
%pip install requests

In [35]:
def get_transcribe_podcast(rss_url, local_path):
  print ("Starting Podcast Transcription Function")
  print ("Feed URL: ", rss_url)
  print ("Local Path:", local_path)

  # Read from the RSS Feed URL
  import feedparser
  intelligence_feed = feedparser.parse(rss_url)
  for item in intelligence_feed.entries[0].links:
    if (item['type'] == 'audio/mpeg'):
      episode_url = item.href
  episode_name = "podcast_episode.mp3"
  print ("RSS URL read and episode URL: ", episode_url)

  # Download the podcast episode by parsing the RSS feed
  from pathlib import Path
  p = Path(local_path)
  p.mkdir(exist_ok=True)

  print ("Downloading the podcast episode")
  import requests
  with requests.get(episode_url, stream=True) as r:
    r.raise_for_status()
    episode_path = p.joinpath(episode_name)
    with open(episode_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

  print ("Podcast Episode downloaded")

  # Load the Whisper model
  import os
  import whisper
  print ("Download and Load the Whisper model")
  model = whisper.load_model("medium")
  print (model.device)

  # Perform the transcription
  print ("Starting podcast transcription")
  result = model.transcribe(local_path + episode_name)

  # Return the transcribed text
  print ("Podcast transcription completed, returning results...")
  return result

In [36]:
output = get_transcribe_podcast("https://access.acast.com/rss/d556eb54-6160-4c85-95f4-47d9f5216c49", "")

Starting Podcast Transcription Function
Feed URL:  https://access.acast.com/rss/d556eb54-6160-4c85-95f4-47d9f5216c49
Local Path: 
RSS URL read and episode URL:  https://sphinx.acast.com/p/acast/s/theintelligencepodcast/e/64feecbe903383001173dfb0/media.mp3?tk=eyJ0ayI6ImRlZmF1bHQiLCJhZHMiOnRydWUsInNwb25zIjp0cnVlLCJzdGF0dXMiOiJwdWJsaWMifQ==&sig=xkJBfl9YyjxEokmIqD9JOb6mSyI9RCSpCpdnwlj1LoE
Downloading the podcast episode
Podcast Episode downloaded
Download and Load the Whisper model
cpu
Starting podcast transcription




Podcast transcription completed, returning results...


In [37]:
output['text'][:1000]

" Whether you're driving to work, biking to a friend's place, or on the way to your next vacation, Amazon Music has your news fix covered. As an Amazon Prime member, you have access to ad-free top podcasts. To start listening, download the Amazon Music app or visit amazon.com slash on the go news. That's amazon.com slash on the go news and listen to your favorite podcasts on the go. Hello and welcome to The Intelligence from The Economist. I'm your host, Aura Ogumbi. Every weekday, we provide a fresh perspective on the events shaping your world. Over the many months since Russia invaded Ukraine, we've brought you frequent updates on the latest developments in the economy. But today's show is something special. The Economist's editor-in-chief, Zanimentan Bedos, was in Ukraine's capital, Kyiv, on Friday to talk to President Volodymyr Zelenskyy at a conference on his country's future. Maybe you want some coffee? Coffee. You want some? Yeah. The last time they spoke was in the spring of la

### Step 1 - Create a cloud transcription function

In [None]:
%pip install modal

In [None]:
%modal token new --source corise > authenticationURL.txt

In [38]:
import getpass
import subprocess

def set_modal_token():
  token_id = getpass.getpass('Please enter your Modal token ID in the cell: ')
  token_secret = getpass.getpass('Please enter your Modal token secret in the cell:  ')

  # Using subprocess to execute the command
  subprocess.run(f"!modal token set --token-id (token_id) --token-secret (token_secret)", shell=True)

In [39]:
import getpass
import subprocess

def set_modal_token():
    token_id = getpass.getpass('Please enter your Modal token ID in the cell: ')
    token_secret = getpass.getpass('Please enter your Modal token secret in the cell: ')

    # Properly format the subprocess command string
    command = f"modal token set --token-id {token_id} --token-secret {token_secret}"

    # Using subprocess to execute the command
    subprocess.run(command, shell=True)

In [40]:
set_modal_token()

In [42]:
%%writefile podcast_backend.py
import modal

def download_whisper():
  # Load the Whisper model
  import os
  import whisper
  print ("Download the Whisper model")

  # Perform download only once and save to Container storage
  model_path = pathlib.Path.cwd()
  whisper._download(whisper._MODELS["medium"], str(model_path), False)

stub = modal.Stub("corise-podcast-project")
corise_image = modal.Image.debian_slim().pip_install("feedparser",
                                                     "https://github.com/openai/whisper/archive/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d.tar.gz",
                                                     "requests",
                                                     "ffmpeg").apt_install("ffmpeg").run_function(download_whisper)

@stub.function(image=corise_image, gpu="any")
def get_transcribe_podcast(rss_url, local_path):
  print ("Starting Podcast Transcription Function")
  print ("Feed URL: ", rss_url)
  print ("Local Path:", local_path)

  # Read from the RSS Feed URL
  import feedparser
  intelligence_feed = feedparser.parse(rss_url)
  for item in intelligence_feed.entries[0].links:
    if (item['type'] == 'audio/mpeg'):
      episode_url = item.href
  episode_name = "podcast_episode.mp3"
  print ("RSS URL read and episode URL: ", episode_url)

  # Download the podcast episode by parsing the RSS feed
  from pathlib import Path
  p = Path(local_path)
  p.mkdir(exist_ok=True)

  print ("Downloading the podcast episode")
  import requests
  with requests.get(episode_url, stream=True) as r:
    r.raise_for_status()
    episode_path = p.joinpath(episode_name)
    with open(episode_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

  print ("Podcast Episode downloaded")

  # Load the Whisper model
  import os
  import whisper

  # Load model from saved location
  print ("Load the Whisper model")
  model = whisper.load_model('medium', device='cuda', download_root='/content/podcast/')

  # Perform the transcription
  print ("Starting podcast transcription")
  result = model.transcribe(local_path + episode_name)

  # Return the transcribed text
  print ("Podcast transcription completed, returning results...")
  return result

@stub.local_entrypoint()
def main(url, path):
  output = get_transcribe_podcast.call(url, path)
  print (output['text'])

Overwriting podcast_backend.py


In [48]:
!modal token set

invoke the function from the command line to start the remote execution in the cloud environment

In [49]:
!modal run podcast_backend.py --url https://access.acast.com/rss/d556eb54-6160-4c85-95f4-47d9f5216c49 --path "."

### Step 2 - Create a cloud information extraction function

### Wikipedia version

In [44]:
%%writefile podcast_backend.py
import modal

def download_whisper():
  # Load the Whisper model
  import os
  import whisper
  print ("Download the Whisper model")

  # Perform download only once and save to Container storage
  model_path = pathlib.Path.cwd()
  whisper._download(whisper._MODELS["medium"], model_path, False)


stub = modal.Stub("corise-podcast-project")
corise_image = modal.Image.debian_slim().pip_install("feedparser",
                                                     "https://github.com/openai/whisper/archive/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d.tar.gz",
                                                     "requests",
                                                     "ffmpeg",
                                                     "openai",
                                                     "tiktoken",
                                                     "wikipedia",
                                                     "ffmpeg-python").apt_install("ffmpeg").run_function(download_whisper)

@stub.function(image=corise_image, gpu="any", timeout=600)
def get_transcribe_podcast(rss_url, local_path):
  print ("Starting Podcast Transcription Function")
  print ("Feed URL: ", rss_url)
  print ("Local Path:", local_path)

  # Read from the RSS Feed URL
  import feedparser
  intelligence_feed = feedparser.parse(rss_url)
  podcast_title = intelligence_feed['feed']['title']
  episode_title = intelligence_feed.entries[0]['title']
  episode_image = intelligence_feed['feed']['image'].href
  for item in intelligence_feed.entries[0].links:
    if (item['type'] == 'audio/mpeg'):
      episode_url = item.href
  episode_name = "podcast_episode.mp3"
  print ("RSS URL read and episode URL: ", episode_url)

  # Download the podcast episode by parsing the RSS feed
  from pathlib import Path
  p = Path(local_path)
  p.mkdir(exist_ok=True)

  print ("Downloading the podcast episode")
  import requests
  with requests.get(episode_url, stream=True) as r:
    r.raise_for_status()
    episode_path = p.joinpath(episode_name)
    with open(episode_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

  print ("Podcast Episode downloaded")

  # Load the Whisper model
  import os
  import whisper

  # Load model from saved location
  print ("Load the Whisper model")
  model = whisper.load_model('medium', device='cuda', download_root=model_path)

  # Perform the transcription
  print ("Starting podcast transcription")
  result = model.transcribe(local_path + episode_name)

  # Return the transcribed text
  print ("Podcast transcription completed, returning results...")
  output = {}
  output['podcast_title'] = podcast_title
  output['episode_title'] = episode_title
  output['episode_image'] = episode_image
  output['episode_transcript'] = result['text']
  return output

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_summary(podcast_transcript):
  import openai
  instructPrompt = """
  You are an expert copywriter who is responsible for publishing newsletters with thousands of subscribers. You recently listened to a great podcast and want to share a summary of it with your readers. Please write the summary of this podcast making sure to cover the important aspects that were discussed and please keep it concise.
  The transcript of the podcast is provided below.
  """
  request = instructPrompt + podcast_transcript
  chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )
  podcastSummary = chatOutput.choices[0].message.content
  return podcastSummary

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_guest(podcast_transcript):
  import openai
  import wikipedia
  import json
  from langchain.llms import OpenAI
  from langchain.agents import load_tools
  from langchain.agents import initialize_agent
  from serpapi import GoogleSearch
  import os
  
  request = podcast_transcript[:10000]
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request}],
    functions=[
    {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "If a guest is introduced and speaks in the podcast, the full name of the guest. If no guest is introduced, then 'None'",
                },
                "guest_organization": {
                    "type": "string",
                    "description": "If the guest_name is not 'None', The full name of the organization that the podcast guest belongs to or runs, otherwise 'None'",
                },
                "guest_title": {
                    "type": "string",
                    "description": "If the guest_organization is not 'None', The title, designation or role of the podcast guest in their organization, otherwise 'None'",
                },
            },
            "required": ["guest_name"],
        },
    }],
    function_call={"name": "get_podcast_guest_information"})
  response_message = completion["choices"][0]["message"]

  if response_message.get("function_call"):
    function_name = response_message["function_call"]["name"]
    function_args = json.loads(response_message["function_call"]["arguments"])
    podcast_guest=function_args.get("guest_name")
    podcast_guest_org=function_args.get("guest_organization")
    podcast_guest_title=function_args.get("guest_title")

  if (podcast_guest is not None):
    if (podcast_guest_org is None):
      podcast_guest_org = ""
    if (podcast_guest_title is None):
      podcast_guest_title = ""
    try:
      input = wikipedia.page(podcast_guest + " " + podcast_guest_org + " " + podcast_guest_title, auto_suggest=True)
      podcast_guest_summary = input.summary
    except wikipedia.exceptions.PageError:
      print(f'The page for guest "{podcast_guest} does not exist on Wikipedia.')
      podcast_guest_summary = "Not Available"
    except wikipedia.exceptions.DisambiguationError as e:
      print(f'The page for guest "{podcast_guest} is ambiguous. Possible matches are:')
      print(e.options)
      podcast_guest_summary = "Not Available"
  else:
    podcast_guest = "Not Available"
    podcast_guest_org = "Not Available"
    podcast_guest_title = "Not Available"
    podcast_guest_summary = "Not Available"

  podcastGuest = {}
  podcastGuest['name'] = podcast_guest
  podcastGuest['org'] = podcast_guest_org
  podcastGuest['title'] = podcast_guest_title
  podcastGuest['summary'] = podcast_guest_summary
  return podcastGuest

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_highlights(podcast_transcript):
  import openai
  instructPrompt = """
  You are a podcast editor and producer. You are provided with the transcript of a podcast episode and have to identify the 5 most significant moments in the podcast as highlights
  - Each highlight needs to be a statement by one of the podcast guests
  - Each highlight has to be impactful and an important takeaway from this podcast episode
  - Each highlight must be concise and make listeners want to hear more about why the podcast guest said that
  - The highlights that you pick must be spread out throughout the episode

  Provide only the highlights and nothing else. Provide the full sentence of the highlight and format it as follows -

  - Highlight 1 of the podcast
  - Highlight 2 of the podcast
  - Highlight 3 of the podcast
  """

  request = instructPrompt + podcast_transcript
  chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )
  podcastHighlights = chatOutput.choices[0].message.content
  return podcastHighlights

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"), timeout=1200)
def process_podcast(url, path):
  output = {}
  podcast_details = get_transcribe_podcast.call(url, path)
  podcast_summary = get_podcast_summary.call(podcast_details['episode_transcript'])
  podcast_guest = get_podcast_guest.call(podcast_details['episode_transcript'])
  podcast_highlights = get_podcast_highlights.call(podcast_details['episode_transcript'])
  output['podcast_details'] = podcast_details
  output['podcast_summary'] = podcast_summary
  output['podcast_guest'] = podcast_guest
  output['podcast_highlights'] = podcast_highlights
  return output

@stub.local_entrypoint()
def test_method(url, path):
  output = {}
  podcast_details = get_transcribe_podcast.call(url, path)
  print ("Podcast Summary: ", get_podcast_summary.call(podcast_details['episode_transcript']))
  print ("Podcast Guest Information: ", get_podcast_guest.call(podcast_details['episode_transcript']))
  print ("Podcast Highlights: ", get_podcast_highlights.call(podcast_details['episode_transcript']))

Overwriting podcast_backend.py


### Google Search Version

In [45]:
%%writefile podcast_backend.py
import modal

def download_whisper():
  # Load the Whisper model
  import os
  import whisper
  print ("Download the Whisper model")

  # Perform download only once and save to Container storage
  model_path = pathlib.Path.cwd()
  whisper._download(whisper._MODELS["medium"], model_path, False)


stub = modal.Stub("corise-podcast-project")
corise_image = modal.Image.debian_slim().pip_install("feedparser",
                                                     "https://github.com/openai/whisper/archive/9f70a352f9f8630ab3aa0d06af5cb9532bd8c21d.tar.gz",
                                                     "requests",
                                                     "ffmpeg",
                                                     "openai",
                                                     "tiktoken",
                                                     "wikipedia",
                                                     "ffmpeg-python").apt_install("ffmpeg").run_function(download_whisper)

@stub.function(image=corise_image, gpu="any", timeout=600)
def get_transcribe_podcast(rss_url, local_path):
  print ("Starting Podcast Transcription Function")
  print ("Feed URL: ", rss_url)
  print ("Local Path:", local_path)

  # Read from the RSS Feed URL
  import feedparser
  intelligence_feed = feedparser.parse(rss_url)
  podcast_title = intelligence_feed['feed']['title']
  episode_title = intelligence_feed.entries[0]['title']
  episode_image = intelligence_feed['feed']['image'].href
  for item in intelligence_feed.entries[0].links:
    if (item['type'] == 'audio/mpeg'):
      episode_url = item.href
  episode_name = "podcast_episode.mp3"
  print ("RSS URL read and episode URL: ", episode_url)

  # Download the podcast episode by parsing the RSS feed
  from pathlib import Path
  p = Path(local_path)
  p.mkdir(exist_ok=True)

  print ("Downloading the podcast episode")
  import requests
  with requests.get(episode_url, stream=True) as r:
    r.raise_for_status()
    episode_path = p.joinpath(episode_name)
    with open(episode_path, 'wb') as f:
      for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

  print ("Podcast Episode downloaded")

  # Load the Whisper model
  import os
  import whisper

  # Load model from saved location
  print ("Load the Whisper model")
  model = whisper.load_model('medium', device='cuda', download_root=model_path)

  # Perform the transcription
  print ("Starting podcast transcription")
  result = model.transcribe(local_path + episode_name)

  # Return the transcribed text
  print ("Podcast transcription completed, returning results...")
  output = {}
  output['podcast_title'] = podcast_title
  output['episode_title'] = episode_title
  output['episode_image'] = episode_image
  output['episode_transcript'] = result['text']
  return output

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_summary(podcast_transcript):
  import openai
  instructPrompt = """
  You are an expert copywriter who is responsible for publishing newsletters with thousands of subscribers. You recently listened to a great podcast and want to share a summary of it with your readers. Please write the summary of this podcast making sure to cover the important aspects that were discussed and please keep it concise.
  The transcript of the podcast is provided below.  
  """
  request = instructPrompt + podcast_transcript
  chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )
  podcastSummary = chatOutput.choices[0].message.content
  return podcastSummary

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_guest(podcast_transcript):
  import openai
  import wikipedia
  import json
  from langchain.llms import OpenAI
  from langchain.agents import load_tools
  from langchain.agents import initialize_agent
  from serpapi import GoogleSearch
  import os
  
  request = podcast_transcript[:10000]
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": request}],
    functions=[
    {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "The full name of the guest in the podcast. This person is not the podcast host. The host will be the main narrator of the podcast. You can generally identify a guest because they will be introduced by the host, and will engage in conversation with, or be interviewed by, the host. If no guest is introduced, then 'None'",
                },
                "guest_organization": {
                    "type": "string",
                    "description": "If the guest_name is not 'None', The full name of the organization that the podcast guest belongs to or runs, otherwise 'None'",
                },
                "guest_title": {
                    "type": "string",
                    "description": "If the guest_organization is not 'None', The title, designation or role of the podcast guest in their organization, otherwise 'None'",
                },
            },
            "required": ["guest_name"],
        },
    }],
    function_call={"name": "get_podcast_guest_information"})
  response_message = completion["choices"][0]["message"]

  if response_message.get("function_call"):
    function_name = response_message["function_call"]["name"]
    function_args = json.loads(response_message["function_call"]["arguments"])
    podcast_guest=function_args.get("guest_name")
    podcast_guest_org=function_args.get("guest_organization")
    podcast_guest_title=function_args.get("guest_title")

  if (podcast_guest is not None):
    if (podcast_guest_org is None):
      podcast_guest_org = ""
    if (podcast_guest_title is None):
      podcast_guest_title = ""
    try:
      input = wikipedia.page(podcast_guest + " " + podcast_guest_org + " " + podcast_guest_title, auto_suggest=True)
      podcast_guest_summary = input.summary
    except wikipedia.exceptions.PageError:
      print(f'The page for guest "{podcast_guest} does not exist on Wikipedia.')
      podcast_guest_summary = "Not Available"
    except wikipedia.exceptions.DisambiguationError as e:
      print(f'The page for guest "{podcast_guest} is ambiguous. Possible matches are:')
      print(e.options)
      podcast_guest_summary = "Not Available"
  else:
    podcast_guest = "Not Available"
    podcast_guest_org = "Not Available"
    podcast_guest_title = "Not Available"
    podcast_guest_summary = "Not Available"

  podcastGuest = {}
  podcastGuest['name'] = podcast_guest
  podcastGuest['org'] = podcast_guest_org
  podcastGuest['title'] = podcast_guest_title
  podcastGuest['summary'] = podcast_guest_summary
  return podcastGuest

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"))
def get_podcast_highlights(podcast_transcript):
  import openai
  instructPrompt = """
  You are a podcast editor and producer. You are provided with the transcript of a podcast episode and have to identify the 5 most significant moments in the podcast as highlights
  - Each highlight needs to be a statement by one of the podcast guests
  - Each highlight has to be impactful and an important takeaway from this podcast episode
  - Each highlight must be concise and make listeners want to hear more about why the podcast guest said that
  - The highlights that you pick must be spread out throughout the episode

  Provide only the highlights and nothing else. Provide the full sentence of the highlight and format it as follows -

  - Highlight 1 of the podcast
  - Highlight 2 of the podcast
  - Highlight 3 of the podcast
  """

  request = instructPrompt + podcast_transcript
  chatOutput = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k",
                                            messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                      {"role": "user", "content": request}
                                                      ]
                                            )
  podcastHighlights = chatOutput.choices[0].message.content
  return podcastHighlights

@stub.function(image=corise_image, secret=modal.Secret.from_name("my-openai-secret"), timeout=1200)
def process_podcast(url, path):
  output = {}
  podcast_details = get_transcribe_podcast.call(url, path)
  podcast_summary = get_podcast_summary.call(podcast_details['episode_transcript'])
  podcast_guest = get_podcast_guest.call(podcast_details['episode_transcript'])
  podcast_highlights = get_podcast_highlights.call(podcast_details['episode_transcript'])
  output['podcast_details'] = podcast_details
  output['podcast_summary'] = podcast_summary
  output['podcast_guest'] = podcast_guest
  output['podcast_highlights'] = podcast_highlights
  return output

@stub.local_entrypoint()
def test_method(url, path):
  output = {}
  podcast_details = get_transcribe_podcast.call(url, path)
  print ("Podcast Summary: ", get_podcast_summary.call(podcast_details['episode_transcript']))
  print ("Podcast Guest Information: ", get_podcast_guest.call(podcast_details['episode_transcript']))
  print ("Podcast Highlights: ", get_podcast_highlights.call(podcast_details['episode_transcript']))

Overwriting podcast_backend.py


In [46]:
!modal run podcast_backend.py --url https://access.acast.com/rss/d556eb54-6160-4c85-95f4-47d9f5216c49 --path "."

In [50]:
!modal deploy podcast_backend.py

In [52]:
# Trying to call the deployed function from another python session
import modal
f = modal.Function.lookup("corise-podcast-project", "process_podcast")
output = f.call('https://feeds.npr.org/510289/podcast.xml', '')

C:\Users\Jeffrey.Roth.NMES\AppData\Local\Temp\ipykernel_23880\1431846519.py:4: DeprecationError: 2023-08-16: `f.call(...)` is deprecated. It has been renamed to `f.remote(...)`
  output = f.call('https://feeds.npr.org/510289/podcast.xml', '')


In [54]:
import json
with open("podcast-3.json", "w") as outfile:
  json.dump(output, outfile)

# Part 3: Deploying the front-end application

In [55]:
%%writefile podcast_frontend.py
import streamlit as st
import modal
import json
import os

def main():
    st.title("Newsletter Dashboard")

    available_podcast_info = create_dict_from_json_files('.')

    # Left section - Input fields
    st.sidebar.header("Podcast RSS Feeds")

    # Dropdown box
    st.sidebar.subheader("Available Podcasts Feeds")
    selected_podcast = st.sidebar.selectbox("Select Podcast", options=available_podcast_info.keys())

    if selected_podcast:

        podcast_info = available_podcast_info[selected_podcast]

        # Right section - Newsletter content
        st.header("Newsletter Content")

        # Display the podcast title
        st.subheader("Episode Title")
        st.write(podcast_info['podcast_details']['episode_title'])

        # Display the podcast summary and the cover image in a side-by-side layout
        col1, col2 = st.columns([7, 3])

        with col1:
            # Display the podcast episode summary
            st.subheader("Podcast Episode Summary")
            st.write(podcast_info['podcast_summary'])

        with col2:
            st.image(podcast_info['podcast_details']['episode_image'], caption="Podcast Cover", width=300, use_column_width=True)

        # Display the podcast guest and their details in a side-by-side layout
        col3, col4 = st.columns([3, 7])

        with col3:
            st.subheader("Podcast Guest")
            st.write(podcast_info['podcast_guest']['name'])

        with col4:
            st.subheader("Podcast Guest Details")
            st.write(podcast_info["podcast_guest"]['summary'])

        # Display the five key moments
        st.subheader("Key Moments")
        key_moments = podcast_info['podcast_highlights']
        for moment in key_moments.split('\n'):
            st.markdown(
                f"<p style='margin-bottom: 5px;'>{moment}</p>", unsafe_allow_html=True)

    # User Input box
    st.sidebar.subheader("Add and Process New Podcast Feed")
    url = st.sidebar.text_input("Link to RSS Feed")

    process_button = st.sidebar.button("Process Podcast Feed")
    st.sidebar.markdown("**Note**: Podcast processing can take upto 5 mins, please be patient.")

    if process_button:

        # Call the function to process the URLs and retrieve podcast guest information
        podcast_info = process_podcast_info(url)

        # Right section - Newsletter content
        st.header("Newsletter Content")

        # Display the podcast title
        st.subheader("Episode Title")
        st.write(podcast_info['podcast_details']['episode_title'])

        # Display the podcast summary and the cover image in a side-by-side layout
        col1, col2 = st.columns([7, 3])

        with col1:
            # Display the podcast episode summary
            st.subheader("Podcast Episode Summary")
            st.write(podcast_info['podcast_summary'])

        with col2:
            st.image(podcast_info['podcast_details']['episode_image'], caption="Podcast Cover", width=300, use_column_width=True)

        # Display the podcast guest and their details in a side-by-side layout
        col3, col4 = st.columns([3, 7])

        with col3:
            st.subheader("Podcast Guest")
            st.write(podcast_info['podcast_guest']['name'])

        with col4:
            st.subheader("Podcast Guest Details")
            st.write(podcast_info["podcast_guest"]['summary'])

        # Display the five key moments
        st.subheader("Key Moments")
        key_moments = podcast_info['podcast_highlights']
        for moment in key_moments.split('\n'):
            st.markdown(
                f"<p style='margin-bottom: 5px;'>{moment}</p>", unsafe_allow_html=True)

def create_dict_from_json_files(folder_path):
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]
    data_dict = {}

    for file_name in json_files:
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            podcast_info = json.load(file)
            podcast_name = podcast_info['podcast_details']['podcast_title']
            # Process the file data as needed
            data_dict[podcast_name] = podcast_info

    return data_dict

def process_podcast_info(url):
    f = modal.Function.lookup("corise-podcast-project", "process_podcast")
    output = f.call(url, '')
    return output

if __name__ == '__main__':
    main()

Overwriting podcast_frontend.py


In [56]:
%%writefile requirements.txt
streamlit
modal

Overwriting requirements.txt
