# News Podcaster Demo
Use OpenAI Assistant to generate a podcast script, then convert it into audio.


In [1]:
from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv). The later cells read
# NEWS_API and OPENAI_API_KEY through os.getenv.
load_dotenv()

True

### Fetch news articles

In [2]:
import requests
import json
import os
import random

# Fetch the current top headlines for one randomly chosen category/country
# pair from NewsAPI and pretty-print the raw JSON payload.
news_api = os.getenv("NEWS_API")
if not news_api:
    # Fail early with a clear message instead of sending "apiKey=None".
    raise RuntimeError("NEWS_API environment variable is not set")

categories = [
    "business",
    "entertainment",
    "general",
    "health",
    "science",
    "sports",
    "technology",
]

countries = ["nz", "au", "us", "hk"]


category = random.choice(categories)
country = random.choice(countries)

print(country)
print(category)

# Let requests build and URL-encode the query string rather than formatting
# the API key into the URL by hand.
url = "https://newsapi.org/v2/top-headlines"
response = requests.get(
    url,
    params={"category": category, "country": country, "apiKey": news_api},
    timeout=30,  # never hang the notebook on an unresponsive server
)
# Surface HTTP errors (bad key, rate limit, ...) here rather than as a
# KeyError on response_json['articles'] in a later cell.
response.raise_for_status()
response_json = response.json()
print(json.dumps(response_json, indent=4))

au
sports
{
    "status": "ok",
    "totalResults": 68,
    "articles": [
        {
            "source": {
                "id": "google-news",
                "name": "Google News"
            },
            "author": "news.com.au",
            "title": "$108m star\u2019s nightmare reaches new low - news.com.au",
            "description": null,
            "url": "https://news.google.com/rss/articles/CBMinQFodHRwczovL3d3dy5uZXdzLmNvbS5hdS9zcG9ydC9iYXNrZXRiYWxsL25iYS9qYW1lcy1oYXJkZW5zLWNsaXBwZXJzLW5pZ2h0bWFyZS1nb2VzLWZyb20tYmFkLXRvLWRvd25yaWdodC1ob3JyaWZpYy9uZXdzLXN0b3J5LzI3ZjA4YTZiMmRiZmFjMzY1Yzc5NGE1MTcyMTU5NjA40gEA?oc=5",
            "urlToImage": null,
            "publishedAt": "2023-11-13T03:57:23Z",
            "content": null
        },
        {
            "source": {
                "id": "google-news",
                "name": "Google News"
            },
            "author": "news.com.au",
            "title": "New Bomber challenges backline narrative - news.com.au",
   

In [3]:
from pydantic import BaseModel, Field
from typing import Optional

# Schema for one news article as used in this notebook: title and url are
# required, description and content are optional and default to None.
class NewsArticle(BaseModel):
    """
    NewsArticle model represents an article fetched from the news API.
    """
    title: str  # headline text
    url: str  # link to the article
    description: Optional[str] = Field(None)  # may be absent (the API can return null)
    content: Optional[str] = Field(None)  # body text; filled from the scraped page later



In [4]:
from bs4 import BeautifulSoup
# Download every headline's page and keep the text of its <p> elements as the
# article body. The result is a list of NewsArticle objects in API order.
articles = []

for article in response_json['articles']:
    article_url = article['url']
    try:
        # Bound each request so one unresponsive site cannot hang the loop.
        article_response = requests.get(article_url, timeout=30)
    except requests.RequestException as exc:
        # Keep the headline even when its page cannot be fetched; the body is
        # simply left empty instead of aborting the whole run.
        print(f"Could not fetch {article_url}: {exc}")
        article_content = None
    else:
        soup = BeautifulSoup(article_response.text, 'html.parser')
        article_content = ' '.join(p.text for p in soup.find_all('p'))
    articles.append(
        NewsArticle(
            title=article['title'],
            url=article_url,
            # .get tolerates payloads that omit the description key entirely.
            description=article.get('description'),
            content=article_content,
        )
    )



In [5]:
from markdownify import MarkdownConverter

# Run every text field of each article through markdownify (ATX headings).
# A missing or empty description/content is replaced by an empty string.
for article in articles:
    article.title = MarkdownConverter(heading_style="atx").convert(article.title)
    for optional_field in ("description", "content"):
        raw_text = getattr(article, optional_field)
        if raw_text:
            converted = MarkdownConverter(heading_style="atx").convert(raw_text)
        else:
            converted = ""
        setattr(article, optional_field, converted)

# Quick sanity check: number of articles and the first converted one.
print(len(articles))
print(articles[0])

20
title='$108m star’s nightmare reaches new low - news.com.au' url='https://news.google.com/rss/articles/CBMinQFodHRwczovL3d3dy5uZXdzLmNvbS5hdS9zcG9ydC9iYXNrZXRiYWxsL25iYS9qYW1lcy1oYXJkZW5zLWNsaXBwZXJzLW5pZ2h0bWFyZS1nb2VzLWZyb20tYmFkLXRvLWRvd25yaWdodC1ob3JyaWZpYy9uZXdzLXN0b3J5LzI3ZjA4YTZiMmRiZmFjMzY1Yzc5NGE1MTcyMTU5NjA40gEA?oc=5' description='' content='An NBA star is being torn to shreds by fans after dishing up a horror display that has raised serious questions around the league. James Harden’s nightmare in Los Angeles reached an even uglier depths on Monday morning as the Clippers took on the Memphis Grizzlies. Against the worst side in the Western Conference, many expected the Clippers to snap their losing-streak. Watch an average of 9 NBA Regular Season games per week LIVE on ESPN, available via Kayo. Join Kayo now and start streaming instantly > But things went from bad to worse for the Clippers with Harden delivering his worst outing since joining the team in a high-profile tra

In [6]:
import tiktoken
# Rough prompt-size estimate: tokenise the string form of the whole article
# list and price it at 0.01 per 1000 tokens.
encoding = tiktoken.encoding_for_model("gpt-4-turbo")

serialized_articles = str(articles)
total_token = len(encoding.encode(serialized_articles))
estimated_cost = total_token/1000*0.01

print(f"total_token: {total_token}")
print(f"total estimated cost: {estimated_cost}")

total_token: 20897
total estimated cost: 0.20897


### Generate characters

In [7]:
from pydantic import BaseModel, Field


# Persona schema for a podcast host or guest. All fields are required.
# NOTE(review): pydantic presumably exposes the class docstring and the Field
# descriptions in the model's JSON schema, which the structured-output program
# later in this file likely sends to the LLM - treat edits to those strings as
# prompt changes; confirm.
class PodcastCharacter(BaseModel):
    """A model representing a podcast character"""

    default_voice: str = Field(..., description="Default voice for the podcaster")
    voice_options: list[str] = Field(
        ..., description="Available voice_options options of the podcaster"
    )
    name: str = Field(..., description="Name of the podcaster")
    role: str = Field(
        ..., description="The primary role of the character in the podcast"
    )
    interest: str = Field(..., description="Main area of interest or expertise")
    political_view: str = Field(..., description="Political leanings of the character")
    specialty: str = Field(..., description="Specific area of focus or specialty")
    description: str = Field(..., description="A brief description about the character")


# The fixed cast of eight personas. The whole list is later offered to the LLM
# as the pool of candidates for both host and guests.
# NOTE(review): voice names are capitalised here ("Nova"), whereas the
# text-to-speech calls later in this file pass lowercase names ("alloy",
# "nova") - confirm the expected casing before wiring these values into TTS.
alexa_reef = PodcastCharacter(
    default_voice="Nova",
    voice_options=["Shimmer", "Nova"],
    name="Alexa Reef",
    role="Environmental Journalist",
    interest="Marine Biology",
    political_view="Left-Leaning",
    specialty="Climate Change and Conservation",
    description="Passionate about environmental issues, Alexa brings a scientific perspective to discussions on climate change and sustainability.",
)

raj_cybertech = PodcastCharacter(
    default_voice="Echo",
    voice_options=["Alloy", "Echo", "Fable", "Onyx"],
    name="Raj Cybertech",
    role="Technology Reporter",
    interest="Tech Trends",
    political_view="Moderate",
    specialty="Impact of Technology on Society",
    description="With a keen eye on emerging technologies, Raj delves into how tech innovations influence modern society and economy.",
)

michael_neuro_johnson = PodcastCharacter(
    default_voice="Fable",
    voice_options=["Alloy", "Echo", "Fable", "Onyx"],
    name="Michael Neuro Johnson",
    role="Sports Commentator",
    interest="Sports and Fitness",
    political_view="Right-Leaning",
    specialty="Personal Responsibility in Sports",
    description="A former athlete, Michael offers insights into the sports world, emphasizing personal achievement and fitness.",
)

lena_logic = PodcastCharacter(
    default_voice="Shimmer",
    voice_options=["Shimmer", "Nova"],
    name="Lena Logic",
    role="Investigative Journalist",
    interest="International Relations",
    political_view="Balanced",
    specialty="World News and Political Affairs",
    description="Lena provides a nuanced and analytical perspective on global political affairs, valuing balanced reporting.",
)

elijah_byte = PodcastCharacter(
    default_voice="Onyx",
    voice_options=["Alloy", "Echo", "Fable", "Onyx"],
    name="Elijah Byte",
    role="Political Commentator",
    interest="Economics",
    political_view="Conservative",
    specialty="Economic and Political Debate",
    description="Elijah offers a conservative viewpoint on political and economic issues, often sparking lively debates.",
)

sophia_algorithm = PodcastCharacter(
    default_voice="Nova",
    voice_options=["Shimmer", "Nova"],
    name="Sophia Algorithm",
    role="Cultural Critic",
    interest="Social Justice",
    political_view="Progressive",
    specialty="Social and Cultural Impact",
    description="Sophia addresses social and cultural topics with a progressive lens, advocating for social justice and inclusivity.",
)

david_data = PodcastCharacter(
    default_voice="Alloy",
    voice_options=["Alloy", "Echo", "Fable", "Onyx"],
    name="David Data",
    role="Financial Analyst",
    interest="Market Trends",
    political_view="Centrist",
    specialty="Economic News Analysis",
    description="David brings a data-driven approach to economic analysis, focusing on market trends and financial policies.",
)

nora_neural = PodcastCharacter(
    default_voice="Shimmer",
    voice_options=["Shimmer", "Nova"],
    name="Nora Neural",
    role="Freelance Journalist",
    interest="Human Rights",
    political_view="Humanitarian",
    specialty="Global Health Issues",
    description="Nora highlights the human stories behind the news, focusing on human rights and global health concerns.",
)

# List of character instances
characters = [
    alexa_reef,
    raj_cybertech,
    michael_neuro_johnson,
    lena_logic,
    elijah_byte,
    sophia_algorithm,
    david_data,
    nora_neural,
]

characters  # Displaying the list of character instances with descriptions

[PodcastCharacter(default_voice='Nova', voice_options=['Shimmer', 'Nova'], name='Alexa Reef', role='Environmental Journalist', interest='Marine Biology', political_view='Left-Leaning', specialty='Climate Change and Conservation', description='Passionate about environmental issues, Alexa brings a scientific perspective to discussions on climate change and sustainability.'),
 PodcastCharacter(default_voice='Echo', voice_options=['Alloy', 'Echo', 'Fable', 'Onyx'], name='Raj Cybertech', role='Technology Reporter', interest='Tech Trends', political_view='Moderate', specialty='Impact of Technology on Society', description='With a keen eye on emerging technologies, Raj delves into how tech innovations influence modern society and economy.'),
 PodcastCharacter(default_voice='Fable', voice_options=['Alloy', 'Echo', 'Fable', 'Onyx'], name='Michael Neuro Johnson', role='Sports Commentator', interest='Sports and Fitness', political_view='Right-Leaning', specialty='Personal Responsibility in Sports

### Generate guidelines

In [8]:
from typing import List

# Editorial guidance for one topic area: a category label plus its bullet
# points. Both fields are required.
class GuidelinePoint(BaseModel):
    """
    A model representing a guideline point for the podcast.
    Each guideline point has a category and a list of points.
    """
    category: str  # topic area the points apply to
    points: List[str]  # individual guidance sentences for that category

# The catalogue of editorial guidelines, one GuidelinePoint per topic area.
# The full list is later passed to the show-generation prompt, whose output
# model (PodcastShow) holds a single GuidelinePoint.
guidelines = [
    GuidelinePoint(
        category="News and Current Events",
        points=[
            "Focus on the most recent and impactful stories.",
            "Provide a brief background for context, especially for ongoing issues.",
            "Highlight the implications of the news on society, economy, or specific communities.",
            "Include diverse perspectives and avoid bias in reporting."
        ]
    ),
    GuidelinePoint(
        category="Technology and Innovations",
        points=[
            "Explain complex technical concepts in layman's terms.",
            "Discuss the potential impact of new technologies on everyday life.",
            "Explore both the benefits and challenges associated with technological advancements.",
            "Include expert opinions or predictions about future trends."
        ]
    ),
    GuidelinePoint(
        category="Environmental and Sustainability Topics",
        points=[
            "Emphasize the relevance of environmental issues to the listener's daily life.",
            "Discuss both local and global perspectives on sustainability.",
            "Offer practical tips or solutions for living a more sustainable lifestyle.",
            "Highlight success stories and positive developments in environmental conservation."
        ]
    ),
    GuidelinePoint(
        category="Cultural and Social Issues",
        points=[
            "Address cultural trends, social movements, and major events in arts and entertainment.",
            "Respect and represent diverse viewpoints and cultural backgrounds.",
            "Discuss the broader societal impact of cultural phenomena.",
            "Incorporate historical context where relevant to understand current trends."
        ]
    )
    # More categories can be added similarly
]

guidelines # Display the guidelines list



[GuidelinePoint(category='News and Current Events', points=['Focus on the most recent and impactful stories.', 'Provide a brief background for context, especially for ongoing issues.', 'Highlight the implications of the news on society, economy, or specific communities.', 'Include diverse perspectives and avoid bias in reporting.']),
 GuidelinePoint(category='Technology and Innovations', points=["Explain complex technical concepts in layman's terms.", 'Discuss the potential impact of new technologies on everyday life.', 'Explore both the benefits and challenges associated with technological advancements.', 'Include expert opinions or predictions about future trends.']),
 GuidelinePoint(category='Environmental and Sustainability Topics', points=["Emphasize the relevance of environmental issues to the listener's daily life.", 'Discuss both local and global perspectives on sustainability.', 'Offer practical tips or solutions for living a more sustainable lifestyle.', 'Highlight success st

### Generate a show

In [9]:
# Structured description of one episode. This is the output class handed to
# the show-generation program, so the LLM's answer is parsed into it.
# NOTE(review): the class docstring presumably ends up in the function schema
# sent to the LLM - treat edits to it as prompt changes; confirm.
class PodcastShow(BaseModel):
    """
    A model representing a podcast show. It includes the show description, host, guests, guidelines, and news articles.
    """
    show_description: str  # free-text pitch for the episode
    host: PodcastCharacter  # exactly one host
    guests: List[PodcastCharacter]  # the prompt asks for 1-2 guests; not enforced by the model
    guideline: GuidelinePoint  # a single guideline category, not the whole list
    news_articles: List[NewsArticle]  # the articles chosen for the episode


In [10]:
from llama_index.llms import OpenAI
from llama_index.program import OpenAIPydanticProgram

# Prompt for the show-planning step. The {host}, {guests}, {guidelines} and
# {articles} placeholders are filled from the keyword arguments passed when
# `program` is called.
prompt_template_str = """
Generate a podcast show with a host, a list of guests (min 1 guest, max 2 guests), guidelines and news articles. The show should pick the relevant guest, host and guidelines based on the news articles provided.
Use the following available options:
Host: {host}

Guests: {guests}

Guidelines: {guidelines}

News Articles: {articles}
"""
# Structured-output program: calls the given OpenAI model with the prompt
# above and returns the answer as a PodcastShow instance (output_cls).
program = OpenAIPydanticProgram.from_defaults(
    llm=OpenAI(model="gpt-4-1106-preview"),
    output_cls=PodcastShow,
    prompt_template_str=prompt_template_str,
    verbose=True,  # the recorded cell output shows the function call and its arguments
)

In [11]:
# Plan the episode. The same `characters` list is offered as the candidate
# pool for both host and guests; the model picks from it, together with one
# guideline and the relevant articles.
output = program(host=characters, guests=characters, guidelines=guidelines, articles=articles)


Function call: PodcastShow with args: {
  "show_description": "Welcome to 'Tech & Tides', a podcast where we dive into the latest technological advancements and explore their impact on environmental conservation. Join us as we discuss innovative solutions, challenges, and the future of sustainability.",
  "host": {
    "default_voice": "Nova",
    "voice_options": [
      "Shimmer",
      "Nova"
    ],
    "name": "Alexa Reef",
    "role": "Environmental Journalist",
    "interest": "Marine Biology",
    "political_view": "Left-Leaning",
    "specialty": "Climate Change and Conservation",
    "description": "Passionate about environmental issues, Alexa brings a scientific perspective to discussions on climate change and sustainability."
  },
  "guests": [
    {
      "default_voice": "Echo",
      "voice_options": [
        "Alloy",
        "Echo",
        "Fable",
        "Onyx"
      ],
      "name": "Raj Cybertech",
      "role": "Technology Reporter",
      "interest": "Tech Trends

In [None]:
# import random

# host = characters[random.randint(0, len(characters)-1)]
# guest = characters[random.randint(0, len(characters)-1)]

# while guest == host:
#     guest = characters[random.randint(0, len(characters)-1)]


# print(host)
# print(guest)

# Updating the function to include the character's name in the description

# def generate_description(character):
#     return (
#         f"{character.name}: A {character.role} with an interest in {character.interest}. "
#         f"Politically, she is {character.political_view} and specializes in {character.specialty}. "
#         f"Description: {character.description}"
#     )

# # Description for Alexa_Reef with her name included
# host_str = generate_description(host)
# guest_str = generate_description(guest)

# print(host_str)
# print(guest_str)

In [13]:
# Script-writing prompt, built from the PodcastShow planned above. The pydantic
# objects are interpolated via their default string form.
# The "Host: <Message>" / "Guest: <Message>" line format requested here is what
# split_text_by_speaker parses later; keep the citation format in sync with
# the pattern used by strip_source_and_url_from_string.
template = f"""You help write podcast scripts for a show called "Podgen". Your objective is to write a podcast script based on the show's description, characters, guideline and the articles.
Show description: {output.show_description}
----
Character details:
Host: {output.host} 
Guest(s): {output.guests}
----
Distill these stories into easy-to-understand narratives, following these guidelines:
{output.guideline}
----
The script should be a conversation between the interviewer (Host) and the guest(s), with no additional information like headers or subheaders.Host and Guest dialogues should be clearly marked:
Host: <Message>
Guest: <Message>
----
Cite your sources accurately next to the statement using the following format: [URL](source link).
The top news articles today are:
{output.news_articles}
"""

In [14]:
from llama_index.llms import OpenAI

# Generate the podcast script with a low temperature (0.1).
# Note: this rebinds `response`, which earlier held the NewsAPI HTTP response.
response = OpenAI(model="gpt-4-1106-preview", temperature=0.1).complete(template)

print(response)

```markdown
# Tech & Tides Podcast Episode: "Harnessing Tech for a Greener Tomorrow"

## Introduction
Host: Welcome to 'Tech & Tides', the podcast that brings you to the intersection of technology and environmental conservation. I'm your host, Alexa Reef, and today we have a special guest, Raj Cybertech, a seasoned technology reporter with a passion for how tech innovations shape our society. Hello, Raj, and welcome to the show!

Guest: Hi Alexa, it's great to be here. I'm excited to dive into today's topics and explore the future of tech and sustainability.

## The Convergence of Technology and Environmental Conservation
Host: Raj, let's start with the basics. In your experience, how can technological advancements contribute to environmental conservation?

Guest: Absolutely, Alexa. When we talk about technology, we often think about gadgets and gizmos, but it's so much more than that. For instance, advancements in renewable energy tech like solar panels and wind turbines are making it

In [None]:
def split_text_by_speaker(text):
    """Split a generated script into individual utterances keyed by role.

    Each non-empty line that contains ``": "`` is split once at that
    separator into a speaker label and the spoken text. A label containing
    ``"Host"`` is classified as the host; any other label is classified as a
    guest. Lines without the separator are ignored.

    Markdown heading lines (starting with ``#``) are skipped even when they
    contain ``": "``: the model sometimes emits titles such as
    ``# Show Episode: "Title"``, which would otherwise be read aloud as a
    guest line.

    Args:
        text: The complete script as a single string.

    Returns:
        A dict in script order mapping ``"<i>_host"`` or ``"<i>_guest"`` to
        the spoken text, where ``i`` is the line's index among the non-empty
        lines of the script.
    """
    speaker_text = {}

    lines = [line for line in text.strip().split("\n") if line != '']
    for i, line in enumerate(lines):
        # Headings are structure, not dialogue. They still count towards the
        # index so keys of real dialogue lines are unaffected.
        if line.lstrip().startswith("#"):
            continue
        speaker, separator, speech = line.partition(": ")
        if not separator:
            continue
        role = "host" if "Host" in speaker else "guest"
        speaker_text[f"{i}_{role}"] = speech

    return speaker_text

# Parse the generated script (string form of the LLM response) into an ordered
# mapping of "<index>_host" / "<index>_guest" keys to spoken text.
speaker_map = split_text_by_speaker(str(response))

In [None]:
# Display the parsed utterances for inspection.
speaker_map

In [None]:
def strip_source_and_url_from_string(speaker_map):
    """Remove inline source citations from every utterance, in place.

    Citations of the form ``[Source](link)`` or ``[URL](link)`` are deleted so
    they are not read aloud, and the remaining text is stripped of leading and
    trailing whitespace.

    Args:
        speaker_map: Dict mapping utterance keys to spoken text. It is
            modified in place.

    Returns:
        The same ``speaker_map`` object, for convenient reassignment.
    """
    import re

    # ``[^)]*`` stops at the first closing parenthesis. A greedy ``.*`` would
    # run to the LAST ")" on the line and swallow the words between two
    # citations. ``URL`` is accepted as a label because the script prompt
    # asks for the ``[URL](source link)`` format.
    citation = re.compile(r'\[(?:Source|URL)\]\([^)]*\)')
    for key, value in speaker_map.items():
        # Only values are replaced, so mutating during iteration is safe.
        speaker_map[key] = citation.sub('', value).strip()
    return speaker_map

# Drop inline citations so they are not spoken; the function edits the dict in
# place and returns it. The bare name then displays the cleaned mapping.
speaker_map = strip_source_and_url_from_string(speaker_map)
speaker_map


In [None]:
import os
import requests
import io
import tempfile

def text_to_audio(file_name, text, voice):
    """Synthesise *text* with the OpenAI text-to-speech HTTP endpoint.

    Args:
        file_name: Prefix used for the name of the temporary audio file.
        text: The text to be spoken.
        voice: Voice name sent to the API; a falsy value selects ``"onyx"``.

    Returns:
        A tuple ``(audio_filename, audio_bytes_io)``: the path of a temporary
        file holding the audio (created with ``delete=False``, so the caller
        must remove it) and an ``io.BytesIO`` with the same bytes, positioned
        at the start.

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    response = requests.post(
        "https://api.openai.com/v1/audio/speech",
        headers={
            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        },
        json={
            "model": "tts-1",
            "input": text,
            "voice": voice or "onyx",
        },
        timeout=120,  # do not block forever on a stalled connection
    )

    # Include the status and body so failures (bad key, quota, invalid voice)
    # can be diagnosed from the exception alone.
    if response.status_code != 200:
        raise Exception(
            f"Request failed with status code {response.status_code}: {response.text}"
        )

    # Read the body once and reuse it for both outputs.
    audio_data = response.content

    # In-memory copy, already positioned at the start for the caller.
    audio_bytes_io = io.BytesIO(audio_data)

    # On-disk copy.
    # NOTE(review): the request does not set a response format, so the bytes
    # may not actually be WAV despite the ".wav" suffix - confirm against the
    # API's default output format.
    with tempfile.NamedTemporaryFile(delete=False, prefix=file_name, suffix=".wav") as tmpfile:
        tmpfile.write(audio_data)
        audio_filename = tmpfile.name

    return audio_filename, audio_bytes_io

In [None]:
# Synthesise every utterance in script order and append the raw audio bytes to
# one in-memory buffer. Host lines use the "alloy" voice, everything else uses
# "nova".
audio_bytes_combined = io.BytesIO()
temp_files = []
try:
    for key, speech in speaker_map.items():
        # A two-way choice guarantees a voice for every key. An if/elif with
        # no fallback would hit unbound or stale variables on any other key.
        voice = "alloy" if 'host' in key else "nova"
        audio_filename, audio_bytes_io = text_to_audio(key, speech, voice)
        temp_files.append(audio_filename)
        audio_bytes_combined.write(audio_bytes_io.read())
    audio_bytes_combined.seek(0)
finally:
    # Only the in-memory bytes are used, so always delete the temporary files,
    # even when a synthesis call fails part-way through.
    for temp_file in temp_files:
        os.remove(temp_file)

In [None]:
# Move the working directory one level up so the relative 'output/...' path
# used below resolves from there. The directory is printed before and after.
# NOTE: the path is relative, so re-running this cell moves up one more level
# each time.
print(os.getcwd())
os.chdir("../")
print(os.getcwd())

In [None]:
# Save the combined audio to a file in the output folder
# Check if the output directory exists, if not, create it
from datetime import date

# Write the combined audio into a per-day folder below ./output.
output_dir = f'output/daily_news_{date.today()}'
# exist_ok=True creates the folder if needed without a separate existence
# check, so there is no check-then-create race.
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, 'openai_tts_combined_audio.wav'), 'wb') as f:
    # getvalue() returns the whole buffer regardless of its current read
    # position, so the complete audio is written even if the buffer was read
    # from earlier.
    f.write(audio_bytes_combined.getvalue())
# Leave the buffer rewound for any later consumer.
audio_bytes_combined.seek(0)