<a href="https://colab.research.google.com/github/Irene32Mwaniki/AI-Storytelling-Project-/blob/main/AIPROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Look up the Gemini API key in the process environment and report the outcome.
# GEMINI_API_KEY is only bound when the variable is present.
if "GEMINI_API_KEY" not in os.environ:
    print("Error: GEMINI_API_KEY environment variable not set.")
else:
    GEMINI_API_KEY = os.environ['GEMINI_API_KEY']
    print("API key loaded from environment variable.")

Gemini Text Story Generator

Rationale: Prompt the user for a story idea, send it to Gemini 2.5 Flash for story generation, and print the generated story.

In [None]:
!pip install -q transformers pillow google-generativeai

In [None]:
from google import generativeai as genai
import os
#client=genai.Client()

In [None]:
# Configure the Gemini SDK when a key is available; otherwise tell the user what to set.
# MODEL is only bound on the success path.
if "GEMINI_API_KEY" in os.environ:
  genai.configure(api_key=os.environ['GEMINI_API_KEY'])
  MODEL="gemini-2.5-flash"
else:
  print("Please set your Gemini API key in the environment variable GEMINI_API_KEY")

In [None]:
# Ask for a story idea and, unless the input is blank, have Gemini write the story.
prompt=input("Enter your Story prompt and press enter:\n")
if prompt.strip():
    print(f"Generating story for prompt: {prompt}")
    print("It may take a few seconds")
    try:
      model = genai.GenerativeModel(model_name=MODEL)
      resp=model.generate_content(contents=[prompt])
      print("\n----Generated Story----\n")
      print(resp.text)
    except Exception as e:
      # Any SDK/network failure is reported instead of raised so the cell finishes.
      print(f"Error generating story: {e}")
else:
  print("No prompt entered, Exiting.")

Image Caption and Story Generator

To upload an image via Colab, use BLIP to create an image caption, and send the caption as a prompt to Gemini to generate a story inspired by the image

In [None]:
!pip install -q transformers pillow google-generativeai timm # pytorch image package

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from google import genai
import io

In [None]:
# Configure the Gemini SDK when a key is available; otherwise tell the user what to set.
if "GEMINI_API_KEY" in os.environ:
  # Rebind `genai` to the google.generativeai module, which provides configure().
  from google import generativeai as genai
  genai.configure(api_key=os.environ['GEMINI_API_KEY'])
  MODEL="gemini-2.5-flash"
else:
  print("Please set your Gemini API key in the environment variable GEMINI_API_KEY")

In [None]:
# Load the BLIP captioning checkpoint: `processor` prepares inputs and decodes output,
# `model` generates the caption tokens.
BLIP_CHECKPOINT="Salesforce/blip-image-captioning-large"
processor=BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model=BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

In [None]:
from google.colab import files
# Open the Colab upload dialog, then preview each uploaded file as an RGB image.
uploaded=files.upload()

for fn in uploaded:
  images=Image.open(fn)
  images=images.convert('RGB')
  display(images)

In [None]:
# Caption every uploaded file with BLIP. After the loop, `caption` holds the text for
# the last file processed.
for fn in uploaded:
    images = Image.open(fn).convert('RGB')
    inputs = processor(images=images, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # One print call with a newline separator emits the same three lines as before.
    print(f"Caption generated by BLIP for {fn}:", caption, "-" * 20, sep="\n")

In [None]:
# Turn the BLIP caption into a Gemini prompt and request a ~300-word story.
story_prompt=(f"Write a short story (around 300 words) based on this scene description: {caption}")
print(story_prompt)

print("Sending this to Gemini. \n")

# Reset first so a failed request cannot leave a story from a previous run behind.
story=None
try:
  # Use a dedicated name for the Gemini model: `model` is the BLIP captioning model, and
  # rebinding it here would break the captioning cell when it is re-run.
  gemini_model = genai.GenerativeModel(model_name=MODEL)
  response=gemini_model.generate_content(contents=[story_prompt])
  story=response.text
  print("\n-----Generated Story -----\n")
  print(story)
except Exception as e:
  # Reported rather than raised so the notebook keeps running; `story` stays None.
  print(f"Error generating story: {e}")

In [None]:
from google.colab import files

# `story` is missing or empty when the generation cell failed or was never run; look it
# up defensively instead of crashing with a NameError.
story_to_save = globals().get("story")
if story_to_save:
  # Explicit UTF-8: generated text commonly contains curly quotes and dashes.
  with open("generated_story.txt", "w", encoding="utf-8") as f:
    f.write(story_to_save)
  files.download("generated_story.txt")
else:
  print("No story available to save. Run the story generation cell successfully first.")

Multi-Image Story + GUI

The goal is to:
- Upload multiple images
- Use BLIP to generate captions for each image
- The user needs to be able to select a tone (romantic, sci-fi, etc.) and a story length using ipywidgets
- Gemini generates a multi-chapter story with outline and full text
- saves the full story as text file

In [None]:
!pip install -q ipywidgets google-generativeai

In [None]:
from google.colab import files
from PIL import Image
import io


# Upload several files at once. `image_names` and `images` are parallel lists holding
# each file name and its decoded RGB image.
uploaded=files.upload()

image_names=list(uploaded)
images=[Image.open(io.BytesIO(raw_bytes)).convert('RGB') for raw_bytes in uploaded.values()]

for image in images:
  display(image)

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import requests
import os
from google import genai
import io

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from google import generativeai as genai

# Load the BLIP captioning checkpoint used for every uploaded image.
processor=BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model=BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

captions=[]

# One caption per image, in upload order, capped at 30 newly generated tokens.
for img in images:
  inputs=processor(images=img,return_tensors='pt')
  out=blip_model.generate(**inputs, max_new_tokens=30)
  caption=processor.decode(out[0],skip_special_tokens=True)
  captions.append(caption)

# Print the summary once, after the loop. Printing inside the loop repeated the growing
# list for every image and reused `caption` as the inner loop variable.
print("Captions generated from images:")
for image_name, image_caption in zip(image_names, captions):
  print(f"{image_name}: {image_caption}")

In [None]:
import ipywidgets as widgets
from IPython.display import display , clear_output


# Tone choices. The selected value is inserted verbatim into the Gemini prompt, so the
# spelling matters ("Adventurous" was previously misspelled).
tone_dropdown=widgets.Dropdown(
    options=["Mystery", "Sci-fi", "Horror", "Adventurous", "Romantic"],
    value="Sci-fi",
    description="Tone:"
)

# Length choices. These labels are the lookup keys used by the generate-button handler.
length_dropdown=widgets.Dropdown(
    options=["Short(100-200 words)", "Medium(300-400 words)", "Long(500-700 words)"],
    value="Medium(300-400 words)",
    description="Length:"
)

generate_button=widgets.Button(description="Generate Story")
# All handler output is routed here so it appears under the controls.
output_area=widgets.Output()

display(tone_dropdown, length_dropdown, generate_button, output_area)

In [56]:
# Dropdown label -> (word range quoted in the prompt, per-chapter output-token ceiling).
# The ceilings sit well above the word counts: a token is shorter than a word, so limits
# equal to the word count (200/400/700) cut chapters off before the requested length.
# NOTE(review): gemini-2.5 models reportedly also spend part of max_output_tokens on
# internal reasoning — confirm against the Gemini API docs and tune these values.
STORY_LENGTHS = {
    "Short(100-200 words)": ("100-200 words", 4096),
    "Medium(300-400 words)": ("300 - 400 words", 6144),
    "Long(500-700 words)": ("500 - 700 words", 8192),
}
OUTLINE_MAX_TOKENS = 4096
CHAPTER_COUNT = 4  # must match the chapter count requested in the outline prompt


def _generated_text(response, what):
    """Return the text of a Gemini response, or raise a readable error if it has none.

    `response.text` raises ValueError when the response carries no usable text part
    (for example when generation was blocked or produced nothing).
    """
    try:
        text = response.text
    except ValueError as err:
        raise RuntimeError(f"Gemini returned no text for the {what}: {err}") from err
    if not text or not text.strip():
        raise RuntimeError(f"Gemini returned an empty {what}.")
    return text


def on_generate_button_clicked(b):
    """Generate an outline and a multi-chapter story from the image captions.

    Reads the notebook globals `captions`, `tone_dropdown`, `length_dropdown` and
    `output_area`; prints progress into `output_area`, writes the chapters to
    multi_image_story.txt and triggers a Colab download.

    Args:
        b: the ipywidgets Button that fired the event (unused).
    """
    with output_area:
        clear_output()

        if not captions:
            print("No captions available. Upload images and run the captioning cell first.")
            return

        selected_tone = tone_dropdown.value
        length, chapter_max_tokens = STORY_LENGTHS[length_dropdown.value]

        # Define MODEL here to ensure it's accessible
        MODEL="gemini-2.5-flash"

        caption_prompt = "\n".join(f"-{c}" for c in captions)

        outline_prompt = (
            f"Using the following scene descriptions, create a {CHAPTER_COUNT}-chapter story outline. "
            f"Each chapter should have a title and a short summary.\n\n"
            f"{caption_prompt}\n\nOutline:"
        )

        try:
            model = genai.GenerativeModel(model_name=MODEL)
            outline_response = model.generate_content(
                contents=[outline_prompt],
                generation_config=genai.GenerationConfig(max_output_tokens=OUTLINE_MAX_TOKENS)
            )
            outline_text = _generated_text(outline_response, "outline")
            print(" Story Outline:\n")
            print(outline_text)

            chapters = []
            for i in range(1, CHAPTER_COUNT + 1):
                chapter_prompt = (
                    f"Using the outline below, write Chapter {i} in a {selected_tone} tone. "
                    f"Make it {length}. Add vivid details, good pacing, and consistent characters.\n\n"
                    f"{outline_text}\n\nChapter {i}:"
                )
                chapter_response = model.generate_content(
                    contents=[chapter_prompt],
                    generation_config=genai.GenerationConfig(max_output_tokens=chapter_max_tokens)
                )
                chapter_text = _generated_text(chapter_response, f"chapter {i}")
                print(f"\n Chapter {i}: \n")
                print(chapter_text)
                chapters.append(f"\n\nChapter {i}: \n{chapter_text}")

            full_story = "".join(chapters)
            # Explicit UTF-8: generated text commonly contains non-ASCII punctuation.
            with open("multi_image_story.txt", "w", encoding="utf-8") as f:
                f.write(full_story)
            print("\nStory saved as multi_image_story.txt")

            from google.colab import files
            files.download("multi_image_story.txt")
        except Exception as e:
            # Reported inside the output widget; the handler must not raise.
            print("Error generating story: ", e)

generate_button.on_click(on_generate_button_clicked)

Export to PDF and Audio

The goal is to be able to:
1. Convert the generated story to a pdf file using ReportLab
2. Convert the story text to MP3 files with different accents using gTTS
3. To allow the user to download both audio and text from Colab

In [None]:
!pip install -q gtts reportlab

In [None]:
# Sample four-chapter story outline (Markdown-formatted) used as the input text for the
# PDF export and text-to-speech cells that follow.
story_text = """
**Chapter 1: The Muddy Promise**
*   **Summary:** Lily and Daisy, two inseparable friends, spend their days in joyful abandon, their laughter echoing across the field as they create elaborate mud castles. Nearby, Old Mr. Silas sits peacefully with his loyal dog, Buster, a quiet fixture in their daily lives. The girls often overhear snippets of village tales about the "Whispering Falls" and the mysteries hidden beyond the familiar path, sparking a shared dream of adventure that begins to overshadow their muddy play.

**Chapter 2: A Glimmer of the Falls**
*   **Summary:** Driven by their growing curiosity, Lily and Daisy decide to finally seek out the legendary Whispering Falls. They bid farewell to their usual play spot, packing a small satchel with provisions. As they venture further than ever before, they pass Mr. Silas and Buster once more. Mr. Silas, sensing their budding quest, offers a knowing smile and a cryptic word of encouragement about listening to the land. The girls follow a winding, overgrown track, their excitement building until they catch their first breathtaking glimpse of the majestic waterfall, its cascades framing a unique road carved straight through its base.

**Chapter 3: The Veiled Passage**
*   **Summary:** Standing before the powerful Whispering Falls, Lily and Daisy are awestruck by its beauty and the strange duality of the ancient water meeting the man-made road. They bravely traverse the damp, echoing passage of the road through the falls, feeling as though they are crossing a threshold into another realm. Emerging on the other side, the air feels different, the forest denser and more wild. They realize the waterfall was not an endpoint, but a gateway to a deeper, more untamed part of the world, where their true adventure begins.

**Chapter 4: The Forest's Secret Keeper**
*   **Summary:** Deeper into the ancient forest, sunlight filters through the dense canopy, dappling the mossy ground. Lily and Daisy, their senses alive with the sights and sounds of the wild, follow the rustling leaves and distant calls. Their journey culminates in a hidden clearing where, hanging playfully from a high tree branch, a curious monkey observes them with intelligent, ancient eyes. The monkey seems to be a silent guardian, a symbol of the forest's untamed spirit. The girls understand that the true "secret" isn't something to find, but the profound connection to nature they've experienced, leaving the forest forever changed by its wild magic.
"""

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from gtts import gTTS


def export_pdf(text,filename="story_text"):
  """Write ``text`` to ``<filename>.pdf`` as Helvetica 12pt on US-letter pages.

  Lines are wrapped at word boundaries to at most 90 characters, blank lines are kept
  as paragraph spacing, and a new page is started whenever the text cursor drops
  below the bottom margin, so long stories no longer run off the first page.

  Args:
    text: the story text; newlines separate paragraphs/lines.
    filename: output path without the ``.pdf`` extension.
  """
  import textwrap  # local import keeps this cell self-contained

  margin=40
  max_chars=90

  c=canvas.Canvas(f"{filename}.pdf",pagesize=letter)
  width, height=letter

  def start_page():
    # A fresh text object is needed per page; set the font on each one explicitly.
    page_text=c.beginText(margin,height-margin)
    page_text.setFont("Helvetica",12)
    return page_text

  text_object=start_page()
  for line in text.split('\n'):
    # textwrap.wrap() returns [] for an empty line; emit one empty row instead so
    # paragraph breaks survive in the PDF.
    for subline in textwrap.wrap(line, width=max_chars) or [""]:
      if text_object.getY() < margin:
        c.drawText(text_object)
        c.showPage()
        text_object=start_page()
      text_object.textLine(subline)
  c.drawText(text_object)
  c.showPage()
  c.save()

from google.colab import files

# Render the sample story to story_text.pdf (export_pdf's default name), then hand the
# file to the browser.
export_pdf(story_text)
files.download("story_text.pdf")

In [None]:
from gtts import gTTS
from IPython.display import Audio, display
from google.colab import files

# Voice presets: label -> gTTS options. Only "lang", "tld" and "slow" are read below.
# NOTE: the "gender"/"accent" keys are never passed to gTTS, so the two "Default English"
# presets currently produce the same audio.
voices = {
    "Default English (US Female)": {"lang": "en", "gender": "female", "accent": "us"},
    "Default English (US Male)": {"lang": "en", "gender": "male", "accent": "us"},
    "British Accent": { "lang": "en", "tld": "co.uk"},
    "Australian Accent": { "lang": "en", "tld": "com.au"},
    "Indian Accent": { "lang": "en", "tld": "co.in"},
    "French Accent": { "lang": "fr", "tld": "fr"},
    "Spanish Accent": { "lang": "es", "tld": "es"},
    "German Accent": { "lang": "de", "tld": "de"},
    "Italian Accent": { "lang": "it", "tld": "it"},
    "Japanese Accent": { "lang": "ja", "tld": "jp"},
    "Slow Reading Voice": { "lang": "en", "slow": True}
}

for label,options in voices.items():
  print(f"Generating Audio: {label}")

  filename=f"{label.replace(' ', '_').lower()}.mp3"

  # Each voice is a separate network request; one failing preset (bad TLD, rate limit,
  # connectivity) should not abort the remaining voices.
  try:
    tts=gTTS(
        text=story_text,
        lang=options["lang"],
        tld=options.get("tld", "com"),
        slow=options.get("slow", False)
    )
    tts.save(filename)
  except Exception as e:
    print(f"Skipping {label}: {e}")
    continue

  display(Audio(filename=filename, autoplay=False))

  files.download(filename)

Streamlit APP

The goal is to:
1. Turn everything into a deployable Streamlit web app
2. Lets the user upload images, choose the tone and story length
3. Exports results as PDF or audio
4. Includes optional ngrok integration for public sharing

In [55]:
%%writefile app_streamlit_story.py
import streamlit as st
from PIL import Image
import io, requests, os
import textwrap
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration
from google import genai
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from pyngrok import ngrok
from tempfile import NamedTemporaryFile
import google.generativeai as genai
import torch

#authentication
# Secrets come from the environment so they never live in the source file.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
BACKGROUND_IMAGE_URL= "https://i.postimg.cc/t4BYWppf/Victoria-falls-1.jpg"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")


#streamlit Page Setup
st.set_page_config(page_title="StoryTeller", layout="wide")

# Page styling. The translucent panels use `backdrop-filter` (the CSS property that
# blurs what is behind an element); `background-filter` is not a CSS property.
st.markdown(
    f"""
    <style>
    .stApp{{
        background-image: url("{BACKGROUND_IMAGE_URL}");
        background-attachment: fixed;
        background-size: cover;
    }}
    section[data-testid="stSidebar"]{{
        background:rgba(0,0,0,0.3);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    div[data-testid="stFileUploader"]{{
        background:rgba(255,255,255,0.2);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    html, body, h1, h2, h3, h4, h5, h6, p,div, span, label, li, input, textarea {{
      color:#93A8AC !important;
    }}
    .stButton>button, .stDownloadButton>button {{
      color:#93A8AC !important;
      border-color:#93A8AC;
    }}
    </style>
    """,
    unsafe_allow_html=True
)

st.title("Multi-Image AI Storyteller")
st.markdown("Upload images - Generate story - Export as PDF & MP3")


@st.cache_resource
def _open_ngrok_tunnel(port, auth_token):
  """Open the ngrok tunnel once and reuse it on later script reruns.

  Streamlit re-executes this script on every interaction; without caching, each rerun
  with the checkbox ticked would call ngrok.connect() again.
  """
  ngrok.set_auth_token(auth_token)
  return ngrok.connect(port)


with st.sidebar:
  tone=st.selectbox("Tone", ["Descriptive", "Narrative", "Factual", "Simple"])
  length_label=st.selectbox("Length", ["Short (100-200 words)", "Medium (300-400 words)", "Long (500-700 words)"])
  start_ngrok = st.checkbox("Start ngrok tunnel")
  if start_ngrok:
    if NGROK_AUTH_TOKEN:
      tunnel=_open_ngrok_tunnel(8501, NGROK_AUTH_TOKEN)
      # Show the bare URL when the tunnel object exposes one; otherwise show it as-is.
      url=getattr(tunnel, "public_url", tunnel)
      st.success(f"Public URL: {url}")
    else:
      st.error("ngrok auth token not set in environment variables.")

uploaded_images = st.file_uploader("Upload multiple images", type=["jpg", "jpeg", "png"], accept_multiple_files=True)

#caption model
@st.cache_resource
def load_models():
  """Return the BLIP ``(processor, captioning model)`` pair for image captioning."""
  checkpoint="Salesforce/blip-image-captioning-large"
  return (
      BlipProcessor.from_pretrained(checkpoint),
      BlipForConditionalGeneration.from_pretrained(checkpoint),
  )

processor, blip_model = load_models()


#config gemini
# Without a key the app only shows an error; `gemini_model` is bound on the success path.
if not GEMINI_API_KEY:
    st.error("GEMINI_API_KEY not set in environment variables.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

    @st.cache_resource
    def load_gemini_model():
      """Build the Gemini model handle used for story generation."""
      return genai.GenerativeModel(model_name="gemini-2.5-flash")

    gemini_model=load_gemini_model()


def get_captions(images):
  """Return one BLIP caption per image, in input order.

  Non-RGB images are converted to RGB before being passed to the processor.
  """
  def caption_one(picture):
    rgb = picture if picture.mode == "RGB" else picture.convert("RGB")
    encoded = processor(images=rgb, return_tensors="pt").to(blip_model.device)
    token_ids = blip_model.generate(**encoded)
    return processor.decode(token_ids[0], skip_special_tokens=True)

  return [caption_one(picture) for picture in images]

def generated_story(captions, tone, length_label):
  """Ask Gemini for a story inspired by ``captions`` and return it as plain text.

  Args:
    captions: list of caption strings, one per uploaded image.
    tone: tone label from the sidebar; lower-cased into the prompt.
    length_label: one of the sidebar length labels (a key of ``length_map``).

  Returns:
    The story text, or a human-readable message when the request failed, was blocked,
    or produced no text. This function never raises for API errors.
  """
  # (min words, max words, output-token ceiling). The ceiling is only a safety cap and
  # is kept well above the word count.
  # NOTE(review): gemini-2.5 models reportedly also spend part of max_output_tokens on
  # internal reasoning — confirm against the Gemini API docs and tune these values.
  length_map = {
      "Short (100-200 words)": (100, 200, 2048),
      "Medium (300-400 words)": (300, 400, 4096),
      "Long (500-700 words)": (500, 700, 8192)
  }
  min_words, max_words, max_tokens = length_map[length_label]

  # Keep the caption list on its own lines so the first and last captions are not glued
  # to the surrounding instructions.
  caption_lines = "\n".join(f"- {cap}" for cap in captions)
  prompt = (
      f"You are a creative writer. Write a {tone.lower()} story based on the following image captions:\n"
      f"{caption_lines}\n\n"
      "The story should be vivid, engaging, and emotionally rich, with a coherent beginning, middle and end."
      f" Make it approximately between {min_words} and {max_words} words long."
  )

  try:
    response = gemini_model.generate_content(
        contents=[prompt],
        generation_config=genai.GenerationConfig(
            temperature=0.9,
            top_p=0.95,
            max_output_tokens=max_tokens
        )
    )
    if not response.candidates:
        return "Story generation was blocked, possibly due to safety concerns with the content."

    # In the Gemini FinishReason enum, 2 is MAX_TOKENS and 3 is SAFETY; compare by name
    # so truncation is not misreported as a safety block.
    finish_reason = response.candidates[0].finish_reason
    reason = getattr(finish_reason, "name", str(finish_reason))
    if reason in ("SAFETY", "3"):
        return "Story generation was blocked, possibly due to safety concerns with the content."

    try:
        # `.text` raises ValueError when the candidate carries no text part.
        story = response.text.strip()
    except ValueError:
        story = ""
    if story:
        return story
    return f"Story generation returned no text (finish reason: {reason}). Please try again."

  except Exception as e:
      return f"Error generating story: {e}"

#pdf generation
def create_pdf(story_text, images):
  """Build the story PDF in memory and return it as a rewound ``io.BytesIO``.

  Page 1 onwards holds the story text (wrapped to the printable width, paginated);
  when ``images`` is non-empty, thumbnails follow on separate pages in a two-column grid.

  Args:
    story_text: the story to print; newlines separate paragraphs.
    images: list of PIL images (may be empty). The originals are not modified.

  Returns:
    io.BytesIO positioned at 0 — always, including when ``images`` is empty.
  """
  from reportlab.lib.utils import simpleSplit  # measures text width for wrapping

  margin = 50
  body_font, body_size, leading = "Helvetica", 11, 15
  cell, gap = 200, 20  # thumbnail box size and spacing, in points

  buffer = io.BytesIO()
  c = canvas.Canvas(buffer, pagesize=A4)
  w, h = A4

  # The background is decorative: if it cannot be fetched or decoded, continue without.
  try:
    resp = requests.get(BACKGROUND_IMAGE_URL, timeout=10)
    resp.raise_for_status()
    bg_img = Image.open(io.BytesIO(resp.content)).convert('RGB')
    c.drawImage(ImageReader(bg_img), 0, 0, width=w, height=h)
  except (requests.RequestException, OSError):
    pass

  c.setFont("Helvetica-Bold", 16)
  c.drawString(margin, h - 50, "Generated Story")

  # Body text gets its own, smaller font so wrapped lines fit the page width.
  c.setFont(body_font, body_size)
  y = h - 80
  for paragraph in story_text.split("\n"):
    # An empty paragraph yields no lines; emit one blank row to keep paragraph spacing.
    for line in simpleSplit(paragraph, body_font, body_size, w - 2 * margin) or [""]:
      if y < 80:
        c.showPage()
        c.setFont(body_font, body_size)  # re-apply the font on every new page
        y = h - 80
      c.drawString(margin, y, line)
      y -= leading

  if images:
    c.showPage()
    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, h - 50, "Uploaded Images")
    # drawImage() anchors the bottom-left corner, so the first row starts one cell
    # below the heading instead of overlapping it.
    x, y = margin, h - 80 - cell
    for img in images:
      if x + cell > w - margin:  # row is full: move to the next row
        x = margin
        y -= cell + gap
      if y < 80:  # next row would fall off the page
        c.showPage()
        x, y = margin, h - 80 - cell
      thumb = img.copy()  # thumbnail() works in place; keep the caller's image intact
      thumb.thumbnail((cell, cell))
      c.drawImage(ImageReader(thumb), x, y, width=cell, height=cell, preserveAspectRatio=True)
      x += cell + gap

  # Finalise unconditionally so a PDF is returned even when no images were supplied.
  c.save()
  buffer.seek(0)
  return buffer

 # audio generation
def create_audio(story):
    """Synthesise ``story`` as English speech with gTTS and return the MP3 bytes.

    Returns:
        io.BytesIO holding the MP3 data, rewound to position 0.
    """
    mp3_buffer = io.BytesIO()
    gTTS(text=story, lang="en").write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)
    return mp3_buffer

# Render exactly one "Generate Story" button and validate after the click, so the page
# never shows two identical buttons when nothing has been uploaded yet.
if st.button("Generate Story", key="generate_story_button_1"):
    if not uploaded_images:
        st.warning("Upload at least one image to begin.")
    elif not GEMINI_API_KEY: # Only proceed if API key is available
        st.error("Gemini API key is not set. Please set it in your environment variables.")
    else:
        pil_images = [Image.open(img) for img in uploaded_images]
        with st.spinner( "Generating captions..."):
          captions = get_captions(pil_images)
          for i, cap in enumerate(captions):
              st.write(f"{i+1}: {cap}")
        with st.spinner("Generating story..."):
          story = generated_story(captions, tone, length_label)
          st.success("Story generated!")
          st.write(story)
        with st.spinner("Creating PDF..."):
          pdf_file=create_pdf(story, pil_images)
          st.download_button("Download Story as PDF", data=pdf_file, file_name="generated_story.pdf", mime="application/pdf")
        with st.spinner("Creating Audio..."):
          audio_file=create_audio(story)
          st.download_button("Download Story as MP3", data=audio_file, file_name="generated_story.mp3", mime="audio/mp3")

Writing app_streamlit_story.py


In [None]:
!pip install -q streamlit pyngrok transformers torch gtts reportlab Pillow
!streamlit run app_streamlit_story.py --server.port 8501 &>/content/log.txt &

from pyngrok import ngrok
ngrok.set_auth_token("33XlRRRqPXH9qe5tPEXfviVlTI6_477Si91yBwWMkY51w1vCx")
public_url=ngrok.connect(8501)
print("Public URL:", public_url)

In [None]:
# Dump the Streamlit server log captured by the launch cell.
with open('/content/log.txt', 'r') as log_file:
    log_contents = log_file.read()
print(log_contents)

# Task
Add background music to the MP3 output (via pydub) for creative flair.

## Install necessary library

### Subtask:
Install pydub in the Colab environment.


**Rationale**:
The subtask is to install the pydub library. I will use pip to install the package.



In [58]:
!pip install -q pydub

## Add background music

### Subtask:
Provide a way to include background music. This could be by uploading a music file or specifying a URL to a music file.


**Rationale**:
Add a file uploader in the Streamlit sidebar for background music and store the uploaded file.



In [64]:
%%writefile app_streamlit_story.py
import streamlit as st
from PIL import Image
import io, requests, os
import textwrap
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration
from google import genai
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from pyngrok import ngrok
from tempfile import NamedTemporaryFile
import google.generativeai as genai
import torch
from pydub import AudioSegment

#authentication
# Secrets come from the environment so they never live in the source file.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
BACKGROUND_IMAGE_URL= "https://i.postimg.cc/t4BYWppf/Victoria-falls-1.jpg"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")


#streamlit Page Setup
st.set_page_config(page_title="StoryTeller", layout="wide")

# Page styling. The translucent panels use `backdrop-filter` (the CSS property that
# blurs what is behind an element); `background-filter` is not a CSS property.
st.markdown(
    f"""
    <style>
    .stApp{{
        background-image: url("{BACKGROUND_IMAGE_URL}");
        background-attachment: fixed;
        background-size: cover;
    }}
    section[data-testid="stSidebar"]{{
        background:rgba(0,0,0,0.3);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    div[data-testid="stFileUploader"]{{
        background:rgba(255,255,255,0.2);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    html, body, h1, h2, h3, h4, h5, h6, p,div, span, label, li, input, textarea {{
      color:#93A8AC !important;
    }}
    .stButton>button, .stDownloadButton>button {{
      color:#93A8AC !important;
      border-color:#93A8AC;
    }}
    </style>
    """,
    unsafe_allow_html=True
)

st.title("Multi-Image AI Storyteller")
st.markdown("Upload images - Generate story - Export as PDF & MP3")


@st.cache_resource
def _open_ngrok_tunnel(port, auth_token):
  """Open the ngrok tunnel once and reuse it on later script reruns.

  Streamlit re-executes this script on every interaction; without caching, each rerun
  with the checkbox ticked would call ngrok.connect() again.
  """
  ngrok.set_auth_token(auth_token)
  return ngrok.connect(port)


with st.sidebar:
  tone=st.selectbox("Tone", ["Descriptive", "Narrative", "Factual", "Simple"])
  length_label=st.selectbox("Length", ["Short (100-200 words)", "Medium (300-400 words)", "Long (500-700 words)"])
  # Optional soundtrack mixed under the narration by create_audio().
  uploaded_music = st.file_uploader("Upload Background Music", type=["mp3", "wav"])
  start_ngrok = st.checkbox("Start ngrok tunnel")
  if start_ngrok:
    if NGROK_AUTH_TOKEN:
      tunnel=_open_ngrok_tunnel(8501, NGROK_AUTH_TOKEN)
      # Show the bare URL when the tunnel object exposes one; otherwise show it as-is.
      url=getattr(tunnel, "public_url", tunnel)
      st.success(f"Public URL: {url}")
    else:
      st.error("ngrok auth token not set in environment variables.")

uploaded_images = st.file_uploader("Upload multiple images", type=["jpg", "jpeg", "png"], accept_multiple_files=True)

#caption model
@st.cache_resource
def load_models():
  """Return the BLIP ``(processor, captioning model)`` pair for image captioning."""
  checkpoint="Salesforce/blip-image-captioning-large"
  return (
      BlipProcessor.from_pretrained(checkpoint),
      BlipForConditionalGeneration.from_pretrained(checkpoint),
  )

processor, blip_model = load_models()


#config gemini
# Without a key the app only shows an error; `gemini_model` is bound on the success path.
if not GEMINI_API_KEY:
    st.error("GEMINI_API_KEY not set in environment variables.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

    @st.cache_resource
    def load_gemini_model():
      """Build the Gemini model handle used for story generation."""
      return genai.GenerativeModel(model_name="gemini-2.5-flash")

    gemini_model=load_gemini_model()


def get_captions(images):
  """Return one BLIP caption per image, in input order.

  Non-RGB images are converted to RGB before being passed to the processor.
  """
  def caption_one(picture):
    rgb = picture if picture.mode == "RGB" else picture.convert("RGB")
    encoded = processor(images=rgb, return_tensors="pt").to(blip_model.device)
    token_ids = blip_model.generate(**encoded)
    return processor.decode(token_ids[0], skip_special_tokens=True)

  return [caption_one(picture) for picture in images]

def generated_story(captions, tone, length_label):
  """Ask Gemini for a story inspired by ``captions`` and return it as plain text.

  Args:
    captions: list of caption strings, one per uploaded image.
    tone: tone label from the sidebar; lower-cased into the prompt.
    length_label: one of the sidebar length labels (a key of ``length_map``).

  Returns:
    The story text, or a human-readable message when the request failed, was blocked,
    or produced no text. This function never raises for API errors.
  """
  # (min words, max words, output-token ceiling). The ceiling is only a safety cap and
  # is kept well above the word count.
  # NOTE(review): gemini-2.5 models reportedly also spend part of max_output_tokens on
  # internal reasoning — confirm against the Gemini API docs and tune these values.
  length_map = {
      "Short (100-200 words)": (100, 200, 2048),
      "Medium (300-400 words)": (300, 400, 4096),
      "Long (500-700 words)": (500, 700, 8192)
  }
  min_words, max_words, max_tokens = length_map[length_label]

  # Keep the caption list on its own lines so the first and last captions are not glued
  # to the surrounding instructions.
  caption_lines = "\n".join(f"- {cap}" for cap in captions)
  prompt = (
      f"You are a creative writer. Write a {tone.lower()} story based on the following image captions:\n"
      f"{caption_lines}\n\n"
      "The story should be vivid, engaging, and emotionally rich, with a coherent beginning, middle and end."
      f" Make it approximately between {min_words} and {max_words} words long."
  )

  try:
    response = gemini_model.generate_content(
        contents=[prompt],
        generation_config=genai.GenerationConfig(
            temperature=0.9,
            top_p=0.95,
            max_output_tokens=max_tokens
        )
    )
    if not response.candidates:
        return "Story generation was blocked, possibly due to safety concerns with the content."

    # In the Gemini FinishReason enum, 2 is MAX_TOKENS and 3 is SAFETY; compare by name
    # so truncation is not misreported as a safety block.
    finish_reason = response.candidates[0].finish_reason
    reason = getattr(finish_reason, "name", str(finish_reason))
    if reason in ("SAFETY", "3"):
        return "Story generation was blocked, possibly due to safety concerns with the content."

    try:
        # `.text` raises ValueError when the candidate carries no text part.
        story = response.text.strip()
    except ValueError:
        story = ""
    if story:
        return story
    return f"Story generation returned no text (finish reason: {reason}). Please try again."

  except Exception as e:
      return f"Error generating story: {e}"

#pdf generation
def create_pdf(story_text, images):
  """Build the story PDF in memory and return it as a rewound ``io.BytesIO``.

  Page 1 onwards holds the story text (wrapped to the printable width, paginated);
  when ``images`` is non-empty, thumbnails follow on separate pages in a two-column grid.

  Args:
    story_text: the story to print; newlines separate paragraphs.
    images: list of PIL images (may be empty). The originals are not modified.

  Returns:
    io.BytesIO positioned at 0 — always, including when ``images`` is empty.
  """
  from reportlab.lib.utils import simpleSplit  # measures text width for wrapping

  margin = 50
  body_font, body_size, leading = "Helvetica", 11, 15
  cell, gap = 200, 20  # thumbnail box size and spacing, in points

  buffer = io.BytesIO()
  c = canvas.Canvas(buffer, pagesize=A4)
  w, h = A4

  # The background is decorative: if it cannot be fetched or decoded, continue without.
  try:
    resp = requests.get(BACKGROUND_IMAGE_URL, timeout=10)
    resp.raise_for_status()
    bg_img = Image.open(io.BytesIO(resp.content)).convert('RGB')
    c.drawImage(ImageReader(bg_img), 0, 0, width=w, height=h)
  except (requests.RequestException, OSError):
    pass

  c.setFont("Helvetica-Bold", 16)
  c.drawString(margin, h - 50, "Generated Story")

  # Body text gets its own, smaller font so wrapped lines fit the page width.
  c.setFont(body_font, body_size)
  y = h - 80
  for paragraph in story_text.split("\n"):
    # An empty paragraph yields no lines; emit one blank row to keep paragraph spacing.
    for line in simpleSplit(paragraph, body_font, body_size, w - 2 * margin) or [""]:
      if y < 80:
        c.showPage()
        c.setFont(body_font, body_size)  # re-apply the font on every new page
        y = h - 80
      c.drawString(margin, y, line)
      y -= leading

  if images:
    c.showPage()
    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, h - 50, "Uploaded Images")
    # drawImage() anchors the bottom-left corner, so the first row starts one cell
    # below the heading instead of overlapping it.
    x, y = margin, h - 80 - cell
    for img in images:
      if x + cell > w - margin:  # row is full: move to the next row
        x = margin
        y -= cell + gap
      if y < 80:  # next row would fall off the page
        c.showPage()
        x, y = margin, h - 80 - cell
      thumb = img.copy()  # thumbnail() works in place; keep the caller's image intact
      thumb.thumbnail((cell, cell))
      c.drawImage(ImageReader(thumb), x, y, width=cell, height=cell, preserveAspectRatio=True)
      x += cell + gap

  # Finalise unconditionally so a PDF is returned even when no images were supplied.
  c.save()
  buffer.seek(0)
  return buffer

 # audio generation
def create_audio(story, background_music=None):
    """Synthesise ``story`` as English speech, optionally mixed over background music.

    Args:
        story: text to narrate with gTTS.
        background_music: optional uploaded file exposing ``name`` and ``getvalue()``
            (as returned by ``st.file_uploader``); ``None`` for narration only.

    Returns:
        io.BytesIO with MP3 data, rewound to 0. If mixing fails for any reason, a
        warning is shown and the narration-only audio is returned.
    """
    audio_bytes = io.BytesIO()
    tts = gTTS(text=story, lang="en")
    tts.write_to_fp(audio_bytes)
    audio_bytes.seek(0)

    if not background_music:
        return audio_bytes

    try:
        story_audio = AudioSegment.from_file(audio_bytes, format="mp3")

        # Take the decoder format from the file extension. The MIME subtype is not a
        # usable format name (an .mp3 upload reports "audio/mpeg", a .wav may report
        # "audio/x-wav"). With no extension, let the decoder detect the format.
        extension = os.path.splitext(background_music.name)[1].lstrip(".").lower()
        background_audio = AudioSegment.from_file(
            io.BytesIO(background_music.getvalue()),
            format=extension or None,
        )
        if len(background_audio) == 0:
            raise ValueError("background music file contains no audio")

        # Ensure background music is at least as long as the story audio
        if len(background_audio) < len(story_audio):
            background_audio = background_audio * (len(story_audio) // len(background_audio) + 1)

        # Overlay background music (adjust volume as needed)
        combined_audio = story_audio.overlay(background_audio[:len(story_audio)] - 15) # -15dB for background

        combined_audio_bytes = io.BytesIO()
        combined_audio.export(combined_audio_bytes, format="mp3")
        combined_audio_bytes.seek(0)
        return combined_audio_bytes
    except Exception as e:
        st.warning(f"Could not add background music: {e}")
        # The narration buffer may have been read to the end above; rewind it.
        audio_bytes.seek(0)
        return audio_bytes # Return story audio only if background music fails

# Render exactly one "Generate Story" button and validate after the click, so the page
# never shows two identical buttons when nothing has been uploaded yet.
if st.button("Generate Story", key="generate_story_button_1"):
    if not uploaded_images:
        st.warning("Upload at least one image to begin.")
    elif not GEMINI_API_KEY: # Only proceed if API key is available
        st.error("Gemini API key is not set. Please set it in your environment variables.")
    else:
        pil_images = [Image.open(img) for img in uploaded_images]
        with st.spinner( "Generating captions..."):
          captions = get_captions(pil_images)
          for i, cap in enumerate(captions):
              st.write(f"{i+1}: {cap}")
        with st.spinner("Generating story..."):
          story = generated_story(captions, tone, length_label)
          st.success("Story generated!")
          st.write(story)
        with st.spinner("Creating PDF..."):
          pdf_file=create_pdf(story, pil_images)
          st.download_button("Download Story as PDF", data=pdf_file, file_name="generated_story.pdf", mime="application/pdf")
        with st.spinner("Creating Audio..."):
          audio_file=create_audio(story, uploaded_music)
          st.download_button("Download Story as MP3", data=audio_file, file_name="generated_story.mp3", mime="audio/mp3")

Overwriting app_streamlit_story.py


## Update streamlit app

### Subtask:
Regenerate the app_streamlit_story.py file with the modified create_audio function.


**Rationale**:
Write the updated app_streamlit_story.py file with the modified create_audio function that handles background music.



In [62]:
%%writefile app_streamlit_story.py
import streamlit as st
from PIL import Image
import io, requests, os
import textwrap
from gtts import gTTS
from transformers import BlipProcessor, BlipForConditionalGeneration
from google import genai
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from pyngrok import ngrok
from tempfile import NamedTemporaryFile
import google.generativeai as genai
import torch
from pydub import AudioSegment

#authentication
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")
BACKGROUND_IMAGE_URL= "https://i.postimg.cc/t4BYWppf/Victoria-falls-1.jpg"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")


#streamlit Page Setup
st.set_page_config(page_title="StoryTeller", layout="wide")

# Inject the page theme: a fixed full-page background image, translucent
# "frosted" panels for the sidebar and the file uploader, and one text colour.
# The blur uses `backdrop-filter` (plus the -webkit- prefixed form); the
# previous `background-filter` is not a CSS property, so browsers dropped it.
st.markdown(
    f"""
    <style>
    .stApp{{
        background-image: url("{BACKGROUND_IMAGE_URL}");
        background-attachment: fixed;
        background-size: cover;
    }}
    section[data-testid="stSidebar"]{{
        background:rgba(0,0,0,0.3);
        -webkit-backdrop-filter:blur(10px);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    div[data-testid="stFileUploader"]{{
        background:rgba(255,255,255,0.2);
        -webkit-backdrop-filter:blur(10px);
        backdrop-filter:blur(10px);
        border-radius:12px;
        padding:10px;
    }}
    html, body, h1, h2, h3, h4, h5, h6, p,div, span, label, li, input, textarea {{
      color:#93A8AC !important;
    }}
    .stButton>button, .stDownloadButton>button {{
      color:#93A8AC !important;
      border-color:#93A8AC;
    }}
    </style>
    """,
    unsafe_allow_html=True
)

st.title("Multi-Image AI Storyteller")
st.markdown("Upload images - Generate story - Export as PDF & MP3")


@st.cache_resource
def _open_ngrok_tunnel(auth_token, port=8501):
  """Open the ngrok tunnel once and reuse it on later script reruns.

  Streamlit re-executes this script on every widget interaction; without
  caching, each rerun with the checkbox ticked would call ngrok.connect again.
  """
  ngrok.set_auth_token(auth_token)
  return ngrok.connect(port)


# Sidebar: story options, optional background music, optional public tunnel.
with st.sidebar:
  tone=st.selectbox("Tone", ["Descriptive", "Narrative", "Factual", "Simple"])
  length_label=st.selectbox("Length", ["Short (100-200 words)", "Medium (300-400 words)", "Long (500-700 words)"])
  uploaded_music = st.file_uploader("Upload Background Music", type=["mp3", "wav"])
  start_ngrok = st.checkbox("Start ngrok tunnel")
  if start_ngrok:
    if NGROK_AUTH_TOKEN:
      try:
        url=_open_ngrok_tunnel(NGROK_AUTH_TOKEN)
        st.success(f"Public URL: {url}")
      except Exception as e:
        # Report tunnel failures in the sidebar instead of aborting the page.
        st.error(f"Could not start ngrok tunnel: {e}")
    else:
      st.error("ngrok auth token not set in environment variables.")

uploaded_images = st.file_uploader("Upload multiple images", type=["jpg", "jpeg", "png"], accept_multiple_files=True)

#caption model
@st.cache_resource
def load_models():
  """Load the BLIP captioning processor and model from the same checkpoint.

  Decorated with st.cache_resource so the pretrained weights are not
  reloaded on every script rerun.

  Returns:
    A (processor, model) tuple.
  """
  checkpoint = "Salesforce/blip-image-captioning-large"
  return (
      BlipProcessor.from_pretrained(checkpoint),
      BlipForConditionalGeneration.from_pretrained(checkpoint),
  )

processor, blip_model = load_models()


#config gemini
# Without an API key only an error banner is shown and `gemini_model` is
# never defined; the generate-button handler checks the key before using it.
if not GEMINI_API_KEY:
    st.error("GEMINI_API_KEY not set in environment variables.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

    @st.cache_resource
    def load_gemini_model():
      """Create the Gemini model handle (cached via st.cache_resource)."""
      return genai.GenerativeModel(model_name="gemini-2.5-flash")

    gemini_model=load_gemini_model()


def get_captions(images):
  """Produce one BLIP caption per image, in input order.

  Args:
    images: iterable of PIL images; non-RGB images are converted to RGB
      before captioning (the caller's objects are left as they are).

  Returns:
    List of caption strings.
  """
  def _caption_one(picture):
    rgb = picture if picture.mode == "RGB" else picture.convert("RGB")
    model_inputs = processor(images=rgb, return_tensors="pt").to(blip_model.device)
    token_ids = blip_model.generate(**model_inputs)
    return processor.decode(token_ids[0], skip_special_tokens=True)

  return [_caption_one(picture) for picture in images]

def generated_story(captions, tone, length_label):
  """Ask Gemini to write a story that ties the image captions together.

  Args:
    captions: iterable of caption strings, one per image.
    tone: tone label; it is lower-cased and inserted into the prompt.
    length_label: one of the keys of `length_map` below; any other value
      raises KeyError.

  Returns:
    The story text on success. On failure a human-readable message is
    returned instead of raising: a "blocked" message when the response has
    no candidates or was stopped for safety, a token-limit message when
    generation was cut off before any text was produced, or
    "Error generating story: ..." for any other exception.
  """
  # label -> (min words, max words, max output tokens)
  length_map = {
      "Short (100-200 words)": (100, 200, 400),
      "Medium (300-400 words)": (300, 400, 800),
      "Long (500-700 words)": (500, 700, 1400)
  }
  min_words, max_words, max_tokens = length_map[length_label]

  # Put the bullet list on its own lines so the first and last captions are
  # not glued to the surrounding sentences.
  caption_list = "\n".join(f"- {cap}" for cap in captions)
  prompt = (
      f"You are a creative writer. Write a {tone.lower()} story based on the following image captions:\n"
      f"{caption_list}\n"
      "The story should be vivid, engaging, and emotionally rich, with a coherent beginning, middle and end."
      f" Make it approximately between {min_words} and {max_words} words long."
  )

  # Numeric values of the Gemini API Candidate.FinishReason enum.
  # 2 is MAX_TOKENS (output truncated); SAFETY is 3.
  finish_max_tokens = 2
  finish_safety = 3
  blocked_message = "Story generation was blocked, possibly due to safety concerns with the content."

  try:
    response = gemini_model.generate_content(
        contents=[prompt],
        generation_config=genai.GenerationConfig(
            temperature=0.9,
            top_p=0.95,
            max_output_tokens=max_tokens
        )
    )
    if not response.candidates:
        return blocked_message

    finish_reason = response.candidates[0].finish_reason
    if finish_reason == finish_safety:
        return blocked_message
    if finish_reason == finish_max_tokens:
        # A truncated story is still useful; only report a failure when the
        # token budget ran out before any text was returned.
        try:
            return response.text.strip()
        except ValueError:
            return (
                "Story generation stopped at the output token limit before any text "
                "was produced. Try a longer length setting."
            )
    return response.text.strip()

  except Exception as e:
      return f"Error generating story: {e}"

#pdf generation
def create_pdf(story_text, images):
  """Render the story, and optionally the uploaded images, into an A4 PDF.

  Args:
    story_text: story to print; whitespace is re-flowed by textwrap.wrap.
    images: sequence of PIL images (may be empty or None). Thumbnails are
      made from copies, so the caller's images are not resized.

  Returns:
    io.BytesIO positioned at offset 0 containing the finished PDF. It is
    returned whether or not any images were supplied.
  """
  buffer = io.BytesIO()
  c = canvas.Canvas(buffer, pagesize=A4)
  w, h = A4

  # The background is decorative: on any download or decode problem, continue
  # with a plain page. The timeout keeps a stalled request from hanging the app.
  try:
    bg_response = requests.get(BACKGROUND_IMAGE_URL, timeout=10)
    bg_response.raise_for_status()
    bg_img = Image.open(io.BytesIO(bg_response.content)).convert('RGB')
    c.drawImage(ImageReader(bg_img), 0, 0, width=w, height=h)
  except Exception:
    pass

  c.setFont("Helvetica-Bold", 16)
  c.drawString(50,h - 50, "Generated Story")

  # Body text uses a smaller regular font so 100-character lines fit the
  # page width. showPage() resets the canvas state, so the font is set again
  # after every page break.
  body_font = ("Helvetica", 10)
  c.setFont(*body_font)
  y = h - 80
  for line in textwrap.wrap(story_text or "", width=100):
    if y < 80:
      c.showPage()
      c.setFont(*body_font)
      y = h - 80
    c.drawString(50, y, line)
    y -= 15

  if images:
    box, gap, margin = 200, 20, 50
    c.showPage()
    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, h - 50, "Uploaded Images")
    # drawImage anchors at the lower-left corner, so the first row starts one
    # box height below the heading rather than overlapping it.
    x, y = margin, h - 80 - box
    for img in images:
      if x + box > w - margin:  # row is full: move to the next row
        x = margin
        y -= box + gap
      if y < margin:  # next row would leave the page: start a new one
        c.showPage()
        x, y = margin, h - margin - box
      thumb = img.copy()
      thumb.thumbnail((box, box))
      c.drawImage(ImageReader(thumb), x, y, width=box, height=box, preserveAspectRatio=True)
      x += box + gap

  c.save()
  buffer.seek(0)
  return buffer

 # audio generation
def create_audio(story, background_music=None):
    """Narrate the story with gTTS and optionally mix in background music.

    Args:
        story: text to narrate (English voice).
        background_music: optional uploaded file object exposing `getvalue()`
            and, ideally, `name`; its extension selects the decoder format.

    Returns:
        io.BytesIO positioned at offset 0 holding MP3 data: the mixed track
        when music was supplied and mixing succeeded, otherwise narration only.
    """
    audio_bytes = io.BytesIO()
    tts = gTTS(text=story, lang="en")
    tts.write_to_fp(audio_bytes)
    audio_bytes.seek(0)

    if not background_music:
        return audio_bytes

    try:
        story_audio = AudioSegment.from_file(audio_bytes, format="mp3")

        # Take the decoder format from the file extension. The MIME subtype
        # (e.g. "mpeg" for audio/mpeg, "x-wav" for audio/x-wav) is not a
        # container name the decoder expects. With no extension, pass None so
        # the format is auto-detected.
        music_name = getattr(background_music, "name", "") or ""
        music_format = os.path.splitext(music_name)[1].lstrip(".").lower() or None
        background_audio = AudioSegment.from_file(
            io.BytesIO(background_music.getvalue()), format=music_format
        )
        if len(background_audio) == 0:
            raise ValueError("background music track is empty")

        # Loop the music until it is at least as long as the narration.
        if len(background_audio) < len(story_audio):
            background_audio = background_audio * (len(story_audio) // len(background_audio) + 1)

        # Trim to the narration length and lower the music by 15 dB.
        combined_audio = story_audio.overlay(background_audio[:len(story_audio)] - 15)

        combined_audio_bytes = io.BytesIO()
        combined_audio.export(combined_audio_bytes, format="mp3")
        combined_audio_bytes.seek(0)
        return combined_audio_bytes
    except Exception as e:
        st.warning(f"Could not add background music: {e}")
        # The narration stream may already have been read while decoding;
        # rewind it so the caller receives it from the start.
        audio_bytes.seek(0)
        return audio_bytes

# A single "Generate Story" button. The checks run only after a click, so a
# click with no images gives a warning instead of silently doing nothing, and
# no second duplicate button is rendered.
# NOTE(review): the results are rendered only on the run in which the button
# was clicked; they presumably disappear when a download button triggers a
# rerun. Confirm, and consider st.session_state if they must persist.
if st.button("Generate Story", key="generate_story_button_1"):
    if not uploaded_images:
        st.warning("Upload at least one image to begin.")
    elif not GEMINI_API_KEY:
        st.error("Gemini API key is not set. Please set it in your environment variables.")
    else:
        pil_images = [Image.open(img) for img in uploaded_images]
        with st.spinner( "Generating captions..."):
          captions = get_captions(pil_images)
          for i, cap in enumerate(captions):
              st.write(f"{i+1}: {cap}")
        with st.spinner("Generating story..."):
          story = generated_story(captions, tone, length_label)
          st.success("Story generated!")
          st.write(story)
        with st.spinner("Creating PDF..."):
          pdf_file=create_pdf(story, pil_images)
          st.download_button("Download Story as PDF", data=pdf_file, file_name="generated_story.pdf", mime="application/pdf")
        with st.spinner("Creating Audio..."):
          audio_file=create_audio(story, uploaded_music)
          st.download_button("Download Story as MP3", data=audio_file, file_name="generated_story.mp3", mime="audio/mp3")

Overwriting app_streamlit_story.py


## Run Streamlit app

### Subtask:
Execute the cell to run the Streamlit application with the new functionality.


**Rationale**:
Execute the code cell to run the Streamlit application and generate the public URL.



In [63]:
!pip install -q streamlit pyngrok transformers torch gtts reportlab Pillow
!streamlit run app_streamlit_story.py --server.port 8501 &>/content/log.txt &

from pyngrok import ngrok
# ngrok token is hardcoded here for demonstration purpose, this should be stored as a secret.
ngrok.set_auth_token("33XlRRRqPXH9qe5tPEXfviVlTI6_477Si91yBwWMkY51w1vCx")
public_url=ngrok.connect(8501)
print("Public URL:", public_url)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Public URL: NgrokTunnel: "https://forespent-bullishly-saran.ngrok-free.dev" -> "http://localhost:8501"
