<a href="https://colab.research.google.com/github/JaganK2Commit/variety/blob/master/BedTimeStory_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### What should be the video be about ?


In [100]:
TOPIC = "script for a Youtube video for a 5-year-old bed time story" # make sure it has "how to _"
NARRATOR_ADJECTIVES = "story teller"
MUSIC_STYLE = "bed time story for 5 year old"

### Install the dependencies

In [3]:
!pip install replicate
!pip install requests
!pip install openai
!pip install langchain
!pip install moviepy
!pip install ffmpeg --upgrade
from google.colab import output
output.clear()

In [4]:
import os
import openai
import time
import numpy as np
import replicate
import json

In [5]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain, TransformChain
from langchain.llms import OpenAI, Replicate
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

## Aquire Tokens for Replicate and OpenAI

In [6]:
# get your token from https://replicate.com/account
from getpass import getpass

REPLICATE_API_TOKEN = getpass()
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

··········


In [7]:
# get your key from https://platform.openai.com/account/api-keys
OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

··········


## Create the LongChain

In [101]:
# LLMChain to write a a script for our how-to video.
topic_template = """
Generate a {topic}.
The script should be a short story with a clear introduction and conclusion, structured into 5 paragraphs.Each paragraph is not more than 50 words
For each paragraph, please provide a corresponding visual description which can be used for painting the visual scene.
The visual description should describe appearances, actions, and emotions without mentioning specific character or object names.
Output of the story should be in JSON format with the keys "title," "introduction," "paragraphs," and "visual_descriptions."

You are a {narrator_adjectives} narrator.
"""

system_message_prompt = SystemMessage(content="You are a helpful assistant that enthusiastically teaches people new topics.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=topic_template,
                                                  input_variables=["topic", "narrator_adjectives"]))

# create the initial script
chat = ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo")
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
script_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='script')

script = script_chain.run({"topic": TOPIC, "narrator_adjectives": NARRATOR_ADJECTIVES, "music_style": MUSIC_STYLE})
# data = json.loads(result)
# script = data['paragraphs']
# video_descriptions = data['video_descriptions']

print(script)
# print(video_descriptions)

{
  "title": "The Moon and the Stars",
  "introduction": "Once upon a time, in a quiet village, there was a little girl called Lily.",
  "paragraphs": [
    "On a clear night, Lily looked up at the sky and saw a bright round object shining.",
    "It was the moon! It had a gentle glow and seemed to smile down at Lily.",
    "Lily wondered if the moon had any friends and looked for other shining objects.",
    "She spotted countless stars, twinkling like tiny diamonds scattered across the sky.",
    "Lily felt happy and peaceful, knowing that the moon and the stars were always watching over her."
  ],
  "visual_descriptions": [
    "A peaceful village with cozy houses and a girl named Lily gazing at the night sky.",
    "A clear night with a big round moon in the sky, beaming with a gentle glow.",
    "Lily searching the sky for other shining objects, her face filled with curiosity.",
    "A dark sky filled with countless sparkling stars, creating a magical atmosphere.",
    "Lily with 

In [102]:
# LLMChain to write a title for our video
llm = OpenAI(temperature=.9)
template = """Please come up with a creative and zany title for the below how-to video script.
Puns are encouraged. Don't include quotations (") in the output.

Script:
{script}
Title: """
prompt_template = PromptTemplate(input_variables=["script"], template=template)
title_chain = LLMChain(llm=llm, prompt=prompt_template, output_key='title')

title = title_chain.run({"script": script})
print(title)



 "Shining Stars and Smiling Moons: A Little Girl's Tale"


In [103]:
# LLMChain to create the replicate predictions for our text-to-image model
def transform_func(inputs: dict) -> dict:
  video_model = replicate.models.get('ai-forever/kandinsky-2')
  video_version = video_model.versions.get("601eea49d49003e6ea75a11527209c4f510a93e2112c969d548fbb45b9c4f19f")
  descriptions = json.loads(inputs['script'])['visual_descriptions']

  predictions = []

  for description in descriptions:
      print(f"Creating video prediction for '{description}'...")
      video_prediction = replicate.predictions.create(version=video_version,
                                                      input={"prompt": description, "prior_steps": '5', "guidance_scale": 4, "num_inference_steps": 100, "prior_cf_scale":4,
                                                             "scheduler": "p_sampler"})
      predictions.append(video_prediction)
  return {'video_predictions': predictions}

video_predictions_chain = TransformChain(input_variables=['script'], output_variables=['video_predictions'], transform=transform_func)

# video_predictions = video_predictions_chain.run({"script": script})
# print(video_predictions)

In [104]:
# LLMChain to create the replicate predictions for our bark model
def transform_func(inputs: dict) -> dict:
  audio_model = replicate.models.get("suno-ai/bark")
  audio_version = audio_model.versions.get("b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787")
  parsed_script = json.loads(inputs['script'])['paragraphs']

  predictions = []

  for line in parsed_script:
      print(f"Creating audio prediction for '{line}''...")
      audio_prediction = replicate.predictions.create(version=audio_version,
                                                      input={"prompt": line, "history_prompt": "announcer"})
      predictions.append(audio_prediction)
  return {'audio_predictions': predictions}

audio_predictions_chain = TransformChain(input_variables=['script'], output_variables=['audio_predictions'], transform=transform_func)

# audio_predictions = audio_predictions_chain.run({"script": script})
# print(audio_predictions)

In [105]:
# LLMChain to create the cover image
llm = OpenAI(temperature=.9)
template = """
          Create a visual description artstation, hd, dramatic lighting, detailed for the following script.
          "{script}""
          """
prompt_template = PromptTemplate(input_variables=["script"], template=template)

text2image = Replicate(model="ai-forever/kandinsky-2:601eea49d49003e6ea75a11527209c4f510a93e2112c969d548fbb45b9c4f19f",
                       input={"prior_steps": '5', "guidance_scale": 4, "num_inference_steps": 100, "prior_cf_scale":4,
                                                        "scheduler": "p_sampler"})

# text2image = Replicate(model="stability-ai/stable-diffusion:db21e45d3f7023abc2a46ee38a23973f6dce16bb082a930b0c49861f96d1e5bf",
#                        input={'image_dimensions': '512x512', "negative_prompt": "text, writing"})
title_image_chain = LLMChain(llm=text2image, prompt=prompt_template, output_key='title_image')

# title_image = title_image_chain.run({"script": script})
# print(title_image)

In [106]:
# LLMChain to write the thank you note at the end of our video
template = """Please come up with a creative and zany ending quote from our narrator.
The script is what the narrator just read. We want to close things out.

Make sure you add a "And don't forget to like and subscribe!" to the end of your output.

You are a {narrator_adjectives} narrator.

Script:
{script}
Ending quote:
"""

system_message_prompt = SystemMessage(content="You are a helpful assistant.")
human_message_prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(
                                                  template=template,
                                                  input_variables=["script", "narrator_adjectives"]))
chat_prompt_template = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
ending_quote_chain = LLMChain(llm=chat, prompt=chat_prompt_template, output_key='ending_quote')

# ending_quote = ending_quote_chain.run({"script": script, "narrator_adjectives": NARRATOR_ADJECTIVES})
# print(ending_quote)

In [107]:
# LLMChain to create the prediction that generates the audio for the thank you note
def transform_func(inputs: dict) -> dict:
  audio_model = replicate.models.get("suno-ai/bark")
  audio_version = audio_model.versions.get("b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787")
  ending_quote_prediction = replicate.predictions.create(version=audio_version,
                                                      input={"prompt": inputs['ending_quote'], "history_prompt": "announcer"})
  return {'ending_quote_prediction': ending_quote_prediction}

ending_quote_prediction_chain = TransformChain(input_variables=['ending_quote'], output_variables=['ending_quote_prediction'], transform=transform_func)

# ending_quote_prediction = ending_quote_prediction_chain.run({"ending_quote": ending_quote})
# print(ending_quote_prediction)

In [108]:
# LLMChain to create the prediction that generates the audio for the title
def transform_func(inputs: dict) -> dict:
  audio_model = replicate.models.get("suno-ai/bark")
  audio_version = audio_model.versions.get("b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787")
  title_prediction = replicate.predictions.create(version=audio_version,
                                                      input={"prompt": inputs['title'], "history_prompt": "announcer"})
  return {'title_audio_prediction': title_prediction}

title_audio_prediction_chain = TransformChain(input_variables=['title'], output_variables=['title_audio_prediction'], transform=transform_func)

# title_audio_prediction = title_audio_prediction_chain.run({"title": title})
# print(title_audio_prediction)

In [109]:
# LLMChain to create the prediction that generates the background music
def transform_func(inputs: dict) -> dict:
  model = replicate.models.get("riffusion/riffusion")
  version = model.versions.get("8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05")
  music_prediction = replicate.predictions.create(version=version, input={"prompt": inputs['music_style']})

  return {'music_prediction': music_prediction}

music_prediction_chain = TransformChain(input_variables=['music_style'], output_variables=['music_prediction'], transform=transform_func)

music_prediction = music_prediction_chain.run({"music_style": MUSIC_STYLE})
print(music_prediction)

id='nk2egrrbbzeunuiu7jerbkevq4' error=None input={'prompt': 'bed time story for 5 year old'} logs='' output=None status='starting' version=Version(id='8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05', created_at=datetime.datetime(2022, 12, 16, 7, 48, 40, 890140, tzinfo=datetime.timezone.utc), cog_version='0.6.1', openapi_schema={'info': {'title': 'Cog', 'version': '0.1.0'}, 'paths': {'/': {'get': {'summary': 'Root', 'responses': {'200': {'content': {'application/json': {'schema': {}}}, 'description': 'Successful Response'}}, 'operationId': 'root__get'}}, '/predictions': {'post': {'summary': 'Predict', 'responses': {'200': {'content': {'application/json': {'schema': {'$ref': '#/components/schemas/PredictionResponse'}}}, 'description': 'Successful Response'}, '422': {'content': {'application/json': {'schema': {'$ref': '#/components/schemas/HTTPValidationError'}}}, 'description': 'Validation Error'}}, 'description': 'Run a single prediction on the model', 'operationId': '

## Run the chain

Now, let's execute the chain we created. This is relatively fast, because the chains that create long-running predictions (like the video_predictions_chain) make asynchronous calls to the Replicate API.

In [110]:
# Run the chain
overall_chain = SequentialChain(chains=[script_chain,
                                        title_chain,
                                        video_predictions_chain,
                                        audio_predictions_chain,
                                        ending_quote_chain,
                                        ending_quote_prediction_chain,
                                        title_image_chain,
                                        title_audio_prediction_chain,
                                        music_prediction_chain
                                        ], input_variables=['topic', 'narrator_adjectives', 'music_style'], output_variables=['script', 'title', 'video_predictions', 'audio_predictions', 'ending_quote', 'title_image', 'ending_quote_prediction', 'title_audio_prediction', 'music_prediction'], verbose=True)
chain_output = overall_chain({"topic": TOPIC, "narrator_adjectives": NARRATOR_ADJECTIVES, "music_style": MUSIC_STYLE})



[1m> Entering new  chain...[0m
Creating video prediction for 'A small glowing creature flies amidst tall trees, leaving a trail of sparkling dust behind.'...
Creating video prediction for 'Animals of various sizes and shapes dance joyfully in a circle, their laughter echoing.'...
Creating video prediction for 'Water flows down a waterfall, creating a misty scene with moss-covered rocks below.'...
Creating video prediction for 'The forest floor is filled with giant mushrooms, each emitting a soft glow of different colors.'...
Creating video prediction for 'The sun sets, casting a warm glow over the trees, turning the sky into a breathtaking display of colors.'...
Creating audio prediction for 'A tiny sparkly creature with wings fluttered through the trees, spreading magic dust.''...
Creating audio prediction for 'The trees leaned and whispered secrets, as animals danced in a circle, their laughter filling the air.''...
Creating audio prediction for 'A shimmering waterfall cascaded d

In [111]:
# unpack outputs
script = chain_output['script']
title = json.loads(script)['title']
split_script = json.loads(script)['paragraphs']
video_descriptions = json.loads(script)['visual_descriptions']
video_predictions = chain_output['video_predictions']
audio_predictions = chain_output['audio_predictions']

# print(title)
# print(split_script)
# print(video_descriptions)
# print(video_predictions)
# print(audio_predictions)

In [112]:
# sanity check
assert len(split_script) == len(video_descriptions)

# ⏳ Wait for our async predictions to complete
Here's a helper to check in on our predictions. This usually takes a minute or two.

In [113]:
def all_done(predictions):
    return set([p.status for p in predictions]) == {'succeeded'}

In [114]:
all_predictions = chain_output['video_predictions'] + \
                  chain_output['audio_predictions'] + \
                  [chain_output['ending_quote_prediction']] + \
                  [chain_output['title_audio_prediction']] + \
                  [chain_output['music_prediction']]

In [115]:
done = False

while not done:
  [p.reload() for p in all_predictions]
  for p in all_predictions:
    print(f'https://replicate.com/p/{p.id}', p.status)
  done = all_done(all_predictions)
  time.sleep(2)
  output.clear()

print("Predictions complete")

Predictions complete


# 🪡 Stitch them all together!

In [116]:
video_urls = [v.output for v in video_predictions]
audio_urls = [a.output['audio_out'] for a in audio_predictions]
music_url = chain_output['music_prediction'].output['audio']
subtitles = split_script
title_image_url = chain_output['title_image']
title_audio_url = chain_output['title_audio_prediction'].output['audio_out']

video_urls


[['https://replicate.delivery/pbxt/qVIh0v5SX4Y8HRo3owQS6yvax84xf4oHS2EcIICQKhoQfhKRA/out_0.png'],
 ['https://replicate.delivery/pbxt/I9SEpXgIheXIP6LYyf9r8fYCbZS0YBYoyPo6Pc7vRmaE9DViA/out_0.png'],
 ['https://replicate.delivery/pbxt/pGWMfQfsFgnLukLnt7yZYrNOL0KfhH4Y9ReGivAac9Ul6HqEB/out_0.png'],
 ['https://replicate.delivery/pbxt/GloDdj0O1o4eMyZrTafuZ4tyfIBMHbw1pXQTJd8e04pu6HqEB/out_0.png'],
 ['https://replicate.delivery/pbxt/H2CcO4QDJ2IfFKqK19rVxO7etQjq7rs0SHhRO170OCsxeDViA/out_0.png']]

In [117]:
import requests
import os
import moviepy.editor as mp
import moviepy.video.fx.all as vfx
import textwrap
from moviepy.editor import *
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
import numpy as np


# Download video and audio files
video_files = []
audio_files = []
# for i, url in enumerate(video_urls):
#     response = requests.get(url)
#     video_filename = f"temp_video{i}.mp4"
#     with open(video_filename, "wb") as video_file:
#         video_file.write(response.content)
#     video_files.append(video_filename)
fps = 12.0
for i, url in enumerate(video_urls):
    response = requests.get(url[0])
    image = Image.open(BytesIO(response.content))
    image_np = np.array(image)
    clip = mp.ImageSequenceClip([image_np], fps=fps)
    video_filename = f"temp_video{i}.mp4"
    clip.write_videofile(video_filename, codec='libx264', fps=fps)
    #with open(video_filename, "wb") as video_file:
        #video_file.write(response.content)
    video_files.append(video_filename)

for i, url in enumerate(audio_urls):
    response = requests.get(url)
    audio_filename = f"temp_audio{i}.mp3"
    with open(audio_filename, "wb") as audio_file:
        audio_file.write(response.content)
    audio_files.append(audio_filename)

# Load and process video and audio files

processed_videos = []
for i, audio_file in enumerate(audio_files):
    video = mp.VideoFileClip(video_files[i])
    audio = mp.AudioFileClip(audio_file)

    # Loop the video for the duration of the audio
    looped_video = mp.concatenate_videoclips([video] * int(audio.duration // video.duration + 1))

    # Set the audio of the video to the audio file
    video_with_audio = looped_video.set_audio(audio)
    processed_videos.append(video_with_audio)

# Concatenate all the processed videos
final_video = mp.concatenate_videoclips(processed_videos)

## The following adds the title image / narration to the video.
# Add this function to create the text image
def txt_image(img, txt, font_size, color):
    image = img.copy()
    draw = ImageDraw.Draw(image)
    draw.text((50, 50), txt, fill=(255, 255, 0))
    # font = ImageFont.load_default().font_variant(size=font_size)
    # draw.text((50, 50), txt, font=font, fill=color)
    return image

# Download and create the image clip
image_url = title_image_url
response = requests.get(image_url)
img = Image.open(BytesIO(response.content))

# Resize the image to match the video dimensions
img_resized = img.resize((1200, 900))

# Download the audio file
audio_url = chain_output['ending_quote_prediction'].output['audio_out']
response = requests.get(audio_url)
with open("temp_audio_ending.mp3", "wb") as audio_file:
    audio_file.write(response.content)

# Create the audio clip
audio_ending = AudioFileClip("temp_audio_ending.mp3")

# make title empty for now, couldn't figure out how to get it bigger
text = chain_output['title']
img_text = ImageClip(np.asarray(txt_image(img_resized, txt='', font_size=48, color="white")), duration=4)

# Set the audio of the image clip to the audio file and trim it to the same duration
img_text_audio_ending = mp.concatenate_videoclips([img_text] * int(audio_ending.duration // img_text.duration + 1))
img_text_audio_ending = img_text.set_audio(audio_ending)

# Download the title page audio file
audio_url = chain_output['title_audio_prediction'].output['audio_out']
response = requests.get(audio_url)
with open("temp_audio_title.mp3", "wb") as audio_file:
    audio_file.write(response.content)

# Create the audio clip
audio_beginning = AudioFileClip("temp_audio_title.mp3")

# Set the audio of the image clip to the audio file and trim it to the same duration
img_text_audio_beginning = mp.concatenate_videoclips([img_text] * int(audio_beginning.duration // img_text.duration + 1))
img_text_audio_beginning = img_text.set_audio(audio_beginning)

# Concatenate the image clip with the processed videos
width, height = processed_videos[0].size
title_video = img_text_audio_beginning.resize((width, height))
ending_video = img_text_audio_ending.resize((width, height))

processed_videos.insert(0, title_video)
processed_videos.append(ending_video)

final_video = concatenate_videoclips(processed_videos)

# Download the background audio file
bg_audio_url = music_url
response = requests.get(bg_audio_url)
with open("temp_bg_audio.mp3", "wb") as audio_file:
    audio_file.write(response.content)

# Create the background audio clip
bg_audio = AudioFileClip("temp_bg_audio.mp3")

# Calculate the duration of the final video
video_duration = final_video.duration

# Loop the background audio to match the final video's duration
bg_audio_looped = bg_audio.fx(afx.audio_loop, duration=video_duration)
bg_audio_looped = bg_audio_looped.volumex(0.5)

# Overlay the background audio with the audio from the final video
final_audio = CompositeAudioClip([final_video.audio, bg_audio_looped])

# Set the audio of the final video to the combined audio
final_video_with_bg_audio = final_video.set_audio(final_audio)

# Save the final video
final_video_with_bg_audio.write_videofile(f"how_to.mp4", codec='libx264', audio_codec='aac')

# Clean up temporary files
for video_file, audio_file in zip(video_files, audio_files):
    os.remove(video_file)
for audio_file in audio_files:
    os.remove(audio_file)

Moviepy - Building video temp_video0.mp4.
Moviepy - Writing video temp_video0.mp4





Moviepy - Done !
Moviepy - video ready temp_video0.mp4
Moviepy - Building video temp_video1.mp4.
Moviepy - Writing video temp_video1.mp4





Moviepy - Done !
Moviepy - video ready temp_video1.mp4
Moviepy - Building video temp_video2.mp4.
Moviepy - Writing video temp_video2.mp4





Moviepy - Done !
Moviepy - video ready temp_video2.mp4
Moviepy - Building video temp_video3.mp4.
Moviepy - Writing video temp_video3.mp4





Moviepy - Done !
Moviepy - video ready temp_video3.mp4
Moviepy - Building video temp_video4.mp4.
Moviepy - Writing video temp_video4.mp4





Moviepy - Done !
Moviepy - video ready temp_video4.mp4
Moviepy - Building video how_to.mp4.
MoviePy - Writing audio in how_toTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video how_to.mp4





Moviepy - Done !
Moviepy - video ready how_to.mp4


In [118]:
#@title Watch the video
from IPython.display import HTML
from base64 import b64encode
mp4 = open(f'how_to.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)