# Tale to Scenes

In [1]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import openai
from packaging import version

# Stop early when the installed openai package is older than 1.1.1.
required_version = version.parse("1.1.1")
current_version = version.parse(openai.__version__)

if current_version < required_version:
    raise ValueError(f"Error: OpenAI version {openai.__version__}"
                     " is less than the required version 1.1.1")

# Only reached when the guard above did not raise.
print("OpenAI version is compatible.")

# -- Now we can get to it
from openai import OpenAI

import os

# Prefer a key supplied through the OPENAI_API_KEY environment variable so a
# real secret never has to be saved inside the notebook. The placeholder is
# kept as a fallback for anyone who still pastes the key in by hand.
OPENAI_SECRET_KEY = os.environ.get("OPENAI_API_KEY", 'ENTER YOUR API KEY')

# pass key to the client
client = OpenAI(api_key=OPENAI_SECRET_KEY)

# Read the whole tale from Tale_short_version.txt.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open('Tale_short_version.txt', 'r', encoding='utf-8') as file:
    tale = file.read()

# User message: the splitting instructions followed by the full tale text.
prompt = (
    "Given a text file containing a story, split the text into scenes. "
    "Each scene should be about no less than 1 and no more than 5 sentences, "
    "and the splitting should be based on logical breaks in the scenario, "
    "as if it were a movie. Try to have around 5 scenes. "
    "Please provide me with a series of scenes without eliminating or "
    f"rephrasing anything from the text: {tale}"
)

# System message: fixes the assistant's role for this request.
system_instruction = (
    "You are a helpful assistant designed to split a text file based on "
    "logical breaks in the plot or scenery, and provide me with a list of "
    "scenes. Do not eliminate sentences and do not paraphrase them."
)

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        dict(role="system", content=system_instruction),
        dict(role="user", content=prompt),
    ],
)

OpenAI version is compatible.


In [6]:
# Keep the text of the first completion choice; it holds the scene list.
first_choice = response.choices[0]
scenes_txt = first_choice.message.content

In [7]:
import re

# Flatten the reply onto one line (every newline becomes a space)
input_string = scenes_txt.replace('\n', ' ')

# Cut the text at each "Scene N:" header; whatever precedes the first
# header is discarded by the [1:] slice
scene_pattern = re.compile(r'Scene \d+:')
scenes_and_text = scene_pattern.split(input_string)[1:]

# Key each chunk by its position ("Scene 1", "Scene 2", ...) with the
# surrounding whitespace trimmed
scene_map = {
    f"Scene {number}": chunk.strip()
    for number, chunk in enumerate(scenes_and_text, start=1)
}

# Show every scene followed by a blank line
for scene, text in scene_map.items():
    print(scene + ":", text, "", sep="\n")

Scene 1:
"In the heart of the ancient world, where time's tapestry was woven with threads of conquest and glory, there stood empires, each a living tale etched in the landscapes of time."

Scene 2:
"The Babylonian Empire: Picture the grandeur of the Euphrates River, its waters carving a lifeline through the desert. The Hanging Gardens, a verdant marvel suspended in the air, dripped with the fragrance of exotic blooms, creating an oasis in the heart of Babylon."

Scene 3:
"The Egyptian Empire: Amidst the golden dunes of the Sahara, the Nile River flowed, a celestial serpent nurturing the cradle of pharaonic might. Pyramids stood as timeless sentinels, their limestone surfaces gleaming in the eternal sun, and colossal statues of sphinxes guarded secrets buried in the sands of time."

Scene 4:
"The Persian Empire: On the vast plateaus of Persia, where the air shimmered with the fragrance of spices, the royal palaces of Persepolis emerged like jewels in the crown of Cyrus the Great. The Ha

In [8]:
import os
# Make sure scenes/<scene name>/ exists for every scene (no error if present)
for scene in scene_map:
    scene_folder = "scenes/" + scene
    os.makedirs(scene_folder, exist_ok=True)

# Scenes to image prompts

### Extract style by MAISA

In [None]:
import requests

import os

url = "https://api.maisa.ai/v1/capabilities/summarize"

# Upper bound (seconds) for the HTTP call; without it requests waits forever
# if the service stops responding.
MAISA_TIMEOUT_SECONDS = 120

payload = {
    "format": "paragraph",
    "length": "long",
    "text": tale,
    "summary_hint": "Summarize the overall plot and style of the text."
}
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    # Read the key from the MAISA_API_KEY environment variable so it is not
    # stored in the notebook; the placeholder remains as a fallback.
    "X-API-Key": os.environ.get("MAISA_API_KEY", "ENTER YOUR API KEY")
}

response = requests.post(url, json=payload, headers=headers,
                         timeout=MAISA_TIMEOUT_SECONDS)

# Show the raw body so a failed call is visible before the summary is parsed.
print(response.text)

In [None]:
# Decode the JSON body of the summarize response and keep its 'summary' field.
summary_response_body = response.json()
summary = summary_response_body['summary']

### Prompts by OpenAI

In [9]:
# System message sent unchanged with every request in the loop below.
image_prompt_system_message = {
    "role": "system",
    "content": "You are a prompt generator for image generation tool that draws scenes from text, that should generate prompt that is no longer than 350 characters.",
}

# Maps each scene name to the image-generation prompt written by the model.
scene_prompt = {}
for scene_name, scene_text in scene_map.items():
    # The scene text plus the overall summary as context for the description.
    prompt = (
        f"Visually and simply describe the scene: {scene_text}. "
        f"Consider the context when describing the scene: {summary}. "
        "Do not erase any important details. "
        "Description should be limited to 350 characters."
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            image_prompt_system_message,
            {"role": "user", "content": prompt},
        ],
    )
    scene_prompt[scene_name] = response.choices[0].message.content
print(scene_prompt)

{'Scene 1': '"An ancient world scene showcasing prominent empires, each representing a story of conquest and glory. Their monumental buildings, landscapes and symbols of power are visible, captured in a tapestry of time."', 'Scene 2': '"View of the Babylonian Empire with the Euphrates River flowing through a desert. The Hanging Gardens are suspended in the air, brimming with lush plants and exotic blooms, serving as a fragrant oasis within Babylon."', 'Scene 3': 'A scene of the Egyptian Empire: Golden Sahara dunes surround the nourishing Nile River. Timeless pyramids stand with gleaming limestone surfaces under the eternal sun. Giant sphinx statues guard buried secrets in the sand.', 'Scene 4': '"A vast shimmering Persian plateau with royal palaces of Persepolis, resembling jewels, under Cyrus the Great\'s reign. The Hall of a Hundred Columns fills with whispers of dignitaries. Intricate reliefs on walls depict an empire that stretches from Aegean to Indus."', 'Scene 5': '"An expansive

# Scenes to Images

In [10]:
!pip install requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import os

# Blockade Labs API key. Taken from the BLOCKADE_API_KEY environment variable
# when it is set, so the secret does not have to be saved in the notebook;
# otherwise the placeholder below is used and must be replaced by hand.
API_KEY = os.environ.get("BLOCKADE_API_KEY", "ENTER YOUR BLOCKADE API KEY")

In [12]:
import requests

# Endpoint URL
URL = "https://backend.blockadelabs.com/api/v1/skybox"

def send_image_requests(prompt, skybox_style_id=2, timeout=30):
    """Submit one skybox generation request to the Blockade Labs API.

    Args:
        prompt: Text prompt describing the scene to draw.
        skybox_style_id: Style identifier sent with the request (default 2,
            the value that was previously hard-coded).
        timeout: Seconds to wait for the HTTP response before requests
            raises a timeout error instead of blocking indefinitely.

    Returns:
        The 'obfuscated_id' from the response body when the server answers
        with HTTP 200, otherwise None (the error is printed).
    """
    # Data to send in the request body
    data = {
        "prompt": prompt,
        "skybox_style_id": skybox_style_id,
    }

    # Set headers including your API key
    headers = {
        "Content-Type": "application/json",
        "x-api-key": API_KEY,
    }

    # Send POST request
    response = requests.post(URL, headers=headers, json=data, timeout=timeout)

    # Anything other than 200 is reported and signalled to the caller as None
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    # Decode the body once and reuse it for both the log line and the result
    body = response.json()
    print("Success! Response:", body)
    return body['obfuscated_id']

In [13]:
def check_request_status(obfuscated_id):
    """Report whether a skybox generation request has finished.

    Args:
        obfuscated_id: Identifier of the request to look up.

    Returns:
        True when the request status is 'complete', False while it is still
        in any non-terminal state.

    Raises:
        RuntimeError: If the API reports the request as 'abort' or 'error'.
            Such a request will never become 'complete', so returning False
            would leave a polling caller waiting forever.
    """
    url = f"https://backend.blockadelabs.com/api/v1/imagine/requests/obfuscated-id/{obfuscated_id}"

    headers = {
        "Content-Type": "application/json",
        "x-api-key": API_KEY,
    }

    # Bounded wait so a stalled connection cannot block the polling loop
    response = requests.get(url, headers=headers, timeout=30)
    request_info = response.json()["request"]
    status = request_info["status"]

    if status in ("abort", "error"):
        raise RuntimeError(
            f"Skybox request {obfuscated_id} ended with status '{status}': "
            f"{request_info.get('error_message')}"
        )

    return status == 'complete'

In [14]:
def download_scene_image(scene, request_id):
    """Download the generated skybox for a scene into its folder.

    Looks up the request to obtain its 'file_url', fetches that image once
    and writes it to scenes/<scene>/skybox.jpg.

    Args:
        scene: Scene name; also the name of the folder under scenes/.
        request_id: Obfuscated id of the generation request.

    Raises:
        requests.HTTPError: If the image URL answers with an error status,
            so that an error page is never saved as skybox.jpg.
    """
    url = f"https://backend.blockadelabs.com/api/v1/imagine/requests/obfuscated-id/{request_id}"

    headers = {
        "Content-Type": "application/json",
        "x-api-key": API_KEY,
    }

    response = requests.get(url, headers=headers, timeout=30)
    image_url = response.json()["request"]["file_url"]

    # Fetch the image a single time (it used to be downloaded twice, with the
    # first result thrown away) and fail loudly on an HTTP error.
    image_response = requests.get(image_url, timeout=120)
    image_response.raise_for_status()

    with open(f"scenes/{scene}/skybox.jpg", "wb") as f:
        f.write(image_response.content)
    print("Skybox downloaded")

In [19]:
import time

# Pause between polling rounds so the status endpoint is not hit in a tight loop.
POLL_INTERVAL_SECONDS = 5

# Send request for all scenes
request_ids_for_scenes = {}

for scene, prompt in scene_prompt.items():
    request_ids_for_scenes[scene] = send_image_requests(prompt)

# send_image_requests yields None when a submission is not accepted. Such a
# scene has nothing to poll for, so it is left out instead of blocking the loop.
pending_requests = {
    scene: request_id
    for scene, request_id in request_ids_for_scenes.items()
    if request_id is not None
}
skipped_scenes = sorted(set(request_ids_for_scenes) - set(pending_requests))
if skipped_scenes:
    print("Skipping scenes whose request was not accepted:", skipped_scenes)

# Poll until every accepted request is complete, downloading each image as
# soon as it becomes available.
fulfilled_scenes_set = set()
while len(fulfilled_scenes_set) != len(pending_requests):
    for scene, request_id in pending_requests.items():
        if scene in fulfilled_scenes_set:
            continue
        if check_request_status(request_id):
            fulfilled_scenes_set.add(scene)
            download_scene_image(scene, request_id)

    # Wait before the next round only if something is still outstanding
    if len(fulfilled_scenes_set) != len(pending_requests):
        time.sleep(POLL_INTERVAL_SECONDS)
    

Success! Response: {'id': 10314130, 'obfuscated_id': '12fe40145f9db7eb50b916ee8ab2ce13', 'user_id': 110840, 'api_key_id': 5328, 'title': 'World #10314130', 'seed': 1711913825, 'negative_text': None, 'prompt': '"An expansive view of Rome\'s seven hills, highlighting the sprawling city made of marble. The colossal Colosseum vibrates with cheers, gladiators locked in combat. Arching aqueducts snake across, supplying water to this metropolis marking the crossroads of power and aspiration."', 'username': 'ketisulamanidze', 'status': 'dispatched', 'queue_position': 0, 'file_url': '', 'thumb_url': '', 'depth_map_url': '', 'remix_imagine_id': None, 'remix_obfuscated_id': None, 'isMyFavorite': False, 'created_at': '2024-03-01T06:20:22+00:00', 'updated_at': '2024-03-01T06:20:22+00:00', 'error_message': None, 'pusher_channel': 'status_update_12fe40145f9db7eb50b916ee8ab2ce13', 'pusher_event': 'status_update', 'type': 'skybox', 'skybox_style_id': 2, 'skybox_id': 2, 'skybox_style_name': 'Fantasy', '

# Image to depth image

In [31]:
!git clone https://github.com/LiheYoung/Depth-Anything.git

Cloning into 'Depth-Anything'...
remote: Enumerating objects: 406, done.[K
remote: Counting objects: 100% (141/141), done.[K
remote: Compressing objects: 100% (114/114), done.[K
remote: Total 406 (delta 76), reused 71 (delta 27), pack-reused 265[K
Receiving objects: 100% (406/406), 237.89 MiB | 29.37 MiB/s, done.
Resolving deltas: 100% (128/128), done.
Updating files: 100% (219/219), done.


In [32]:
!pip install -r Depth-Anything/requirements.txt

Collecting gradio_imageslider (from -r Depth-Anything/requirements.txt (line 1))
  Downloading gradio_imageslider-0.0.18-py3-none-any.whl.metadata (10 kB)
Collecting gradio==4.14.0 (from -r Depth-Anything/requirements.txt (line 2))
  Downloading gradio-4.14.0-py3-none-any.whl.metadata (15 kB)
Collecting torchvision (from -r Depth-Anything/requirements.txt (line 4))
  Downloading torchvision-0.17.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting opencv-python (from -r Depth-Anything/requirements.txt (line 5))
  Downloading opencv_python-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting gradio-client==0.8.0 (from gradio==4.14.0->-r Depth-Anything/requirements.txt (line 2))
  Downloading gradio_client-0.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting torch (from -r Depth-Anything/requirements.txt (line 3))
  Downloading torch-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-nccl-cu12==2.19.3 (from torch

# Scenes to speech

In [35]:
for index in range(len(scene_map)):
    !python Depth-Anything/run.py --encoder vitl --img-path f'scene/Scene {index+1}/skybox.jpg' --outdir f'scene/Scene {index+1}/skybox_depth.jpg' --pred-only --grayscale

config.json: 100%|██████████████████████████████| 116/116 [00:00<00:00, 199kB/s]
Traceback (most recent call last):
  File "/workspace/keti_zura/Depth-Anything/run.py", line 34, in <module>
    depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE).eval()
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/.miniconda3/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/workspace/.miniconda3/lib/python3.11/site-packages/huggingface_hub/hub_mixin.py", line 277, in from_pretrained
    instance = cls._from_pretrained(
               ^^^^^^^^^^^^^^^^^^^^^
  File "/workspace/.miniconda3/lib/python3.11/site-packages/huggingface_hub/hub_mixin.py", line 485, in _from_pretrained
    model = cls(**model_kwargs)
            ^^^^^^^^^^^^^^^^^^^
  File "/workspace/

In [30]:
from pathlib import Path

# Narrate each scene with the "onyx" voice of the tts-1 model and store the
# audio as speech.wav inside that scene's folder.
# NOTE(review): a later cell decodes this file with format="mp3", so the .wav
# extension may not reflect the real container - confirm.
for scene_number in range(1, len(scene_map) + 1):
    scene_key = f"Scene {scene_number}"
    scene_speech_file_path = f'scenes/{scene_key}/speech.wav'
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=scene_map[scene_key],
    )
    response.stream_to_file(scene_speech_file_path)

  response.stream_to_file(scene_speech_file_path)


# Scenes to background audio prompts

In [21]:
# System message sent unchanged with every request in the loop below.
music_system_message = {
    "role": "system",
    "content": "You are a helpful assistant designed to generate a 1 sentence description for a background music",
}

# One music description per scene, in scene_map iteration order.
sound_descriptions = []
for scene_text in scene_map.values():
    prompt = (
        f"Based on the following scene: {scene_text} "
        "Write a 1 sentence song description, specifying instruments and style."
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            music_system_message,
            {"role": "user", "content": prompt},
        ],
    )

    sound_descriptions.append(response.choices[0].message.content)

# Scenes to background audio

In [23]:
!pip install pydub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
from pydub import AudioSegment

def get_mp3_duration(file_path):
    """Return the length of the audio file at file_path in seconds.

    The file is decoded as MP3; len() of the loaded segment is divided by
    1000.0 to obtain seconds as a float.
    """
    return len(AudioSegment.from_file(file_path, format="mp3")) / 1000.0

In [40]:
!pip install torch==2.1.0

Collecting torch==2.1.0
  Using cached torch-2.1.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting nvidia-nccl-cu12==2.18.1 (from torch==2.1.0)
  Using cached nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)
Collecting triton==2.1.0 (from torch==2.1.0)
  Using cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Using cached torch-2.1.0-cp311-cp311-manylinux1_x86_64.whl (670.2 MB)
Using cached nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)
Using cached triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)
Installing collected packages: triton, nvidia-nccl-cu12, torch
  Attempting uninstall: triton
    Found existing installation: triton 2.2.0
    Uninstalling triton-2.2.0:
      Successfully uninstalled triton-2.2.0
  Attempting uninstall: nvidia-nccl-cu12
    Found existing installation: nvidia-nccl-cu12 2.19.3
    Uninstalling nvidia-nccl-cu12-2.19.3:
 

In [41]:
!pip install -r requirements.txt

Collecting git+https://github.com/huggingface/transformers.git (from -r requirements.txt (line 1))
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-8yn0htgl
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-8yn0htgl
  Resolved https://github.com/huggingface/transformers.git to commit e7b983706586c0b809437851f3ba5863b4eda9c0
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [42]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import scipy

In [43]:
# Load the MusicGen processor and model from the same checkpoint
MUSICGEN_CHECKPOINT = "facebook/musicgen-small"

processor = AutoProcessor.from_pretrained(MUSICGEN_CHECKPOINT)
model = MusicgenForConditionalGeneration.from_pretrained(MUSICGEN_CHECKPOINT)

preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.87k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

In [45]:
import math
# Import the submodule explicitly; a bare `import scipy` does not guarantee
# that scipy.io.wavfile is loaded on every SciPy version.
import scipy.io.wavfile

# The output sampling rate comes from the model config, so read it once.
sampling_rate = model.config.audio_encoder.sampling_rate

# sound_descriptions was filled by iterating scene_map, so the two line up
# one-to-one; refuse to continue if that no longer holds.
if len(sound_descriptions) != len(scene_map):
    raise ValueError(
        f"Expected one music description per scene, got "
        f"{len(sound_descriptions)} descriptions for {len(scene_map)} scenes"
    )

for scene, music_description in zip(scene_map, sound_descriptions):
    # Condition MusicGen on the generated music description, not on the
    # scene narration, which was being passed here by mistake.
    inputs = processor(
        text=[music_description],
        padding=True,
        return_tensors="pt",
    )
    # Match the music length to the narration of the same scene
    audio_length = get_mp3_duration(f'scenes/{scene}/speech.wav')
    # Token budget scaled from the speech duration (256 tokens per 5 seconds)
    audio_values = model.generate(**inputs, max_new_tokens=math.ceil(audio_length*256/5))
    # Save the wav file into your system
    scipy.io.wavfile.write(f"scenes/{scene}/background_music.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())