## 1. Image-Based Short Video Generation Using AI

In [None]:
# Install required libraries
!pip install -q diffusers transformers accelerate torch torchvision safetensors imageio imageio-ffmpeg

# Imports
from diffusers import StableVideoDiffusionPipeline
import torch
from PIL import Image
import numpy as np
import imageio
from google.colab import files  # for uploading files
from IPython.display import Video, display

# Upload an image
print("Please upload an image file (jpg/png).")
uploaded = files.upload()
image_path = list(uploaded.keys())[0]  # get uploaded file name

image = Image.open(image_path).convert("RGB")
image = image.resize((512, 512))  # resize for model

# Load video diffusion model
model_id = "stabilityai/stable-video-diffusion-img2vid-xt"
pipe = StableVideoDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    variant="fp16"
).to("cuda")  # use GPU

# Generate a short video
result = pipe(image, num_frames=6)  # 6 frames for low memory
frames = result.frames[0]

# Save and display video
video_path = "/content/generated_video.mp4"
imageio.mimsave(video_path, [np.array(f) for f in frames], fps=8)

print("Video generated and saved at:", video_path)
display(Video(video_path, embed=True, width=560))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m725.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.5/170.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hPlease upload an image file (jpg/png).


## 2. AI-Based Short Video Generation from Image and Text Inputs Using Diffusion Models

In [None]:
# Install required libraries
!pip install -q diffusers transformers accelerate torch torchvision safetensors imageio imageio-ffmpeg

# Imports
from diffusers import StableVideoDiffusionPipeline, DiffusionPipeline
import torch
import imageio
import numpy as np
from PIL import Image
from google.colab import files
from IPython.display import Video, display

# Upload an image
print("Please upload an image file (JPG or PNG):")
uploaded = files.upload()
image_path = list(uploaded.keys())[0]
print(f"Image uploaded successfully: {image_path}")

image = Image.open(image_path).convert("RGB")
image = image.resize((512, 512))

# Generate video from uploaded image
print("\nGenerating video from uploaded image...")
img_model = "stabilityai/stable-video-diffusion-img2vid-xt"
img_pipe = StableVideoDiffusionPipeline.from_pretrained(
    img_model, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

img_result = img_pipe(image, num_frames=6)  # generate 6 frames
img_frames = img_result.frames[0]

image_video_path = "/content/image_video.mp4"
imageio.mimsave(image_video_path, [np.array(f) for f in img_frames], fps=8)

# Generate video from text prompt
print("\nGenerating video from text prompt...")
text_model = "damo-vilab/text-to-video-ms-1.7b"
text_pipe = DiffusionPipeline.from_pretrained(
    text_model, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

prompt = "A mountain landscape with clouds moving slowly."
text_result = text_pipe(prompt, num_frames=8)
text_frames = text_result.frames[0]

text_video_path = "/content/text_video.mp4"
imageio.mimsave(text_video_path, [np.array(f) for f in text_frames], fps=8)

# Display generated videos
print("\nImage-based Video:")
display(Video(image_video_path, embed=True, width=500))

print("\nText-based Video:")
display(Video(text_video_path, embed=True, width=500))

print("\nBoth videos have been generated successfully.")


## 3. Design an AI system that can describe artworks or museum exhibits aloud when an image of an artifact is uploaded.

The system should automatically analyze the image, generate a descriptive caption, and convert it into speech narration.

In [None]:
# Install libraries
!pip install -q gTTS transformers torch torchvision pillow -U

# Imports
from google.colab import files
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from IPython.display import Audio, display

# Upload image
print("Please upload an image (jpg/png)")
uploaded = files.upload()
image_path = list(uploaded.keys())[0]
image = Image.open(image_path).convert("RGB")

# Load BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to("cuda")

# Prepare inputs and generate caption
inputs = processor(image, return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_length=30)
caption = processor.decode(out[0], skip_special_tokens=True)

print("\nGenerated Description:")
print(caption)

# Convert caption to speech
tts = gTTS(caption, lang="en")
tts.save("image_speech.mp3")

print("\nImage analyzed and converted to speech successfully!")
display(Audio("image_speech.mp3"))
