## CE Enablement Session - Practice 02
## Workflow: Storytelling (IMG --> VEO)

### 1. (권장) 가상환경 생성 및 활성화
python -m venv venv
#### macOS/Linux:
source venv/bin/activate

### 2. 필수 파이썬 라이브러리 설치
pip install google-cloud-aiplatform google-cloud-texttospeech moviepy Pillow requests

In [8]:
# ==============================================================================
# 1. 패키지 IMPORT
# ==============================================================================
import json
import time
import os
import base64
from typing import Dict, List, Optional

# Google Cloud & GenAI
import google.auth
import google.auth.transport.requests
import requests
from google.cloud import texttospeech
from google import genai
from google.genai import types
from requests.exceptions import HTTPError

# Media Processing (MoviePy & PIL)
from moviepy.editor import (
    VideoFileClip, AudioFileClip, CompositeVideoClip,
    concatenate_videoclips, ImageClip, CompositeAudioClip
)
import moviepy.video.fx.all as vfx
import moviepy.audio.fx.all as afx
from PIL import Image


# ==============================================================================
# 2. Global configuration
# ==============================================================================
# --- Google Cloud & Vertex AI settings ---
PROJECT_ID = "jc-gcp-project"
LOCATION = "us-central1"

# --- Model names ---
STORY_MODEL = "gemini-2.5-flash"
IMAGEN_MODEL = "imagen-4.0-generate-preview-06-06"
VEO_MODEL = "veo-3.0-generate-preview"
# Built from LOCATION so that changing the region above also updates both the
# regional host name and the resource path of the Lyria endpoint.
LYRIA_MODEL_ENDPOINT = (
    f"https://{LOCATION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}"
    f"/locations/{LOCATION}/publishers/google/models/lyria-002:predict"
)

# --- TTS voice ---
# Voice catalogue: https://cloud.google.com/text-to-speech/docs/list-voices-and-types
TTS_VOICE_NAME = "ko-KR-Chirp3-HD-Callirrhoe"


# --- Output directory ---
# Every generated asset (images, clips, audio, final video) is written here.
OUTPUT_DIR = "veo_story_telling"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ==============================================================================
# 3. API client initialization
# ==============================================================================
# Each API client is created once here and reused by every pipeline call below.
print("API 클라이언트를 초기화합니다...")
# NOTE(review): the GenAI client uses location="global" rather than LOCATION —
# confirm this is intentional for the models configured above.
genai_client = genai.Client(vertexai=True, project=PROJECT_ID, location="global")
tts_client = texttospeech.TextToSpeechClient()
print("✅ API 클라이언트 초기화 완료.")


# ==============================================================================
# 4. 기능별 함수 정의 (FUNCTIONS)
# ==============================================================================

def generate_storyline(
    client: genai.Client,
    video_description: str,
    min_scenes: int = 1
) -> Dict:
    """Step 1: ask Gemini for a structured storyline of the video.

    Args:
        client: GenAI client used to call ``STORY_MODEL``; it is passed in
            rather than created here so one client can be reused.
        video_description: Free-form description of the video to produce.
        min_scenes: Minimum scene count mentioned in the prompt and in the
            schema description (it is stated to the model, not validated
            on the parsed result).

    Returns:
        The parsed JSON storyline. The response schema requires the keys
        ``description``, ``music`` and ``scenes``; every scene requires
        ``story`` and ``script``.
    """
    print("🎬 1단계: 스토리라인 생성 중...")

    # Schema of a single entry of the "scenes" array.
    scene_schema = {
        "type": "OBJECT",
        "properties": {
            "story": {"type": "STRING", "description": "Storyline for this specific scene"},
            "script": {"type": "STRING", "description": "Voiceover script for this scene"},
        },
        "required": ["story", "script"],
    }
    storyline_schema = {
        "type": "OBJECT",
        "properties": {
            "description": {"type": "STRING", "description": "Summary of the video"},
            "music": {"type": "STRING", "description": "Description of music style (15 words or less)"},
            "scenes": {
                "type": "ARRAY",
                "description": f"An array of at least {min_scenes} scenes for the video.",
                "items": scene_schema,
            },
        },
        "required": ["description", "music", "scenes"],
    }

    prompt = f"""You are a professional video producer.
                Create a storyline for video based on the content below.

                [VIDEO_DESCRIPTION]
                {video_description}

                Please create a storyline at least {min_scenes} scenes.
                Each scene should contain one key piece of information and flow naturally to the next scene.

                Use this JSON schema for output:
                {{
                    "description": "Summary of video",
                    "music": "Description of music style (15 words or less)",
                    "scenes": [
                        {{"story": "Storyline for the first scene", "script": "Script for voice"}},
                        {{"story": "Storyline for the second scene", "script": "Script for voice"}},
                        ...
                    ]
                }}"""

    user_message = types.Content(role="user", parts=[types.Part.from_text(text=prompt)])
    language_rules = [types.Part.from_text(text="""Script must be in KOREAN. Music must be in English""")]

    response = client.models.generate_content(
        model=STORY_MODEL,
        contents=[user_message],
        config=types.GenerateContentConfig(
            temperature=1,
            top_p=0.95,
            seed=0,
            max_output_tokens=8192,
            response_mime_type="application/json",
            response_schema=storyline_schema,
            system_instruction=language_rules,
        ),
    )

    # Drop an optional Markdown code fence around the JSON payload before parsing.
    payload = response.text.strip()
    payload = payload.removeprefix("```json").removesuffix("```")
    storyline = json.loads(payload.strip())

    print('###### storyline 확인 하기 ######')
    print(storyline)
    scene_count = len(storyline['scenes'])
    print(f"✅ 스토리라인 생성 완료: {scene_count}개 장면")
    return storyline

def generate_image_with_prior_context(
    client: genai.Client,
    scene_story: str,
    prior_image_path: Optional[str] = None,
    scene_idx: int = 0
) -> str:
    """Step 2: generate the image for a scene, optionally continuing from a prior image.

    When ``prior_image_path`` points to an existing file, ``STORY_MODEL`` is
    first asked (with that image attached) to write an image prompt that
    keeps visual continuity. Otherwise a generic cinematic prompt is built
    directly from the scene story. The prompt is then sent to
    ``IMAGEN_MODEL``.

    Args:
        client: GenAI client used for both the prompt and the image request.
        scene_story: Storyline text of the scene.
        prior_image_path: Path of the image the previous scene ended on.
        scene_idx: Zero-based scene index; used in log lines and the file name.

    Returns:
        Path of the PNG file written under ``OUTPUT_DIR``.
    """
    print(f"🖼️  2단계: 장면 {scene_idx + 1} 이미지 생성 중...")

    has_prior_image = bool(prior_image_path) and os.path.exists(prior_image_path)
    if not has_prior_image:
        image_prompt = f"Create a high-quality cinematic image for this scene: {scene_story}. The image should be suitable for video generation."
    else:
        prompt_text = f"""You are producing a video, and you want to create an image for the next scene.
                        The previous scene ended with the image below, and now you need to create an image for this scene:
                        [SCENE_STORY] {scene_story}
                        Please create a prompt for image generation that references the previous image for visual continuity and creates a natural transition.
                        Use this JSON schema for output: {{"prompt": "prompt for image generation with continuity"}}"""

        prompt_schema = {
            "type": "OBJECT",
            "properties": {"prompt": {"type": "STRING"}},
            "required": ["prompt"],
        }

        with open(prior_image_path, "rb") as prior_file:
            prior_bytes = prior_file.read()
        prior_part = types.Part.from_bytes(data=prior_bytes, mime_type="image/png")

        prompt_response = client.models.generate_content(
            model=STORY_MODEL,
            contents=[prompt_text, prior_part],
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
                response_schema=prompt_schema,
            ),
        )
        image_prompt = json.loads(prompt_response.text)["prompt"]

    imagen_response = client.models.generate_images(
        model=IMAGEN_MODEL,
        prompt=image_prompt,
        config=types.GenerateImagesConfig(aspect_ratio="16:9", number_of_images=1),
    )

    # Only one image is requested, so the first result is the one to persist.
    png_bytes = imagen_response.generated_images[0].image.image_bytes
    image_path = f"{OUTPUT_DIR}/scene_{scene_idx}_image.png"
    with open(image_path, "wb") as image_file:
        image_file.write(png_bytes)

    print(f"✅ 장면 {scene_idx + 1} 이미지 생성 완료: {image_path}")
    return image_path

def generate_video_from_image(
    client: genai.Client,
    image_path: str,
    scene_story: str,
    scene_idx: int
) -> str:
    """Step 3: generate a VEO video clip that starts from the given image.

    Args:
        client: GenAI client used to start and poll the video operation.
        image_path: Path of the source image for the clip.
        scene_story: Storyline text of the scene, embedded in the prompt.
        scene_idx: Zero-based scene index; used in log lines and the file name.

    Returns:
        Path of the MP4 file written under ``OUTPUT_DIR``, or an empty string
        when any step fails (the error is printed, not raised).
    """
    print(f"📹 3단계: 장면 {scene_idx + 1} 비디오 생성 중...")

    import mimetypes

    # Small local holder exposing the attributes (gcs_uri, image_bytes,
    # mime_type) that are handed to generate_videos() as its `image` argument.
    class ImageForVEO:
        def __init__(self, gcs_uri=None, image_bytes=None, mime_type=None):
            self.gcs_uri = gcs_uri
            self.image_bytes = image_bytes
            self.mime_type = mime_type

    try:
        guessed_type, _encoding = mimetypes.guess_type(image_path)
        if not guessed_type:
            raise ValueError(f"Could not determine MIME type for {image_path}")

        with open(image_path, "rb") as source:
            raw_image = source.read()

        # Image instance in the shape passed to the VEO call below.
        veo_image = ImageForVEO(gcs_uri=None, image_bytes=raw_image, mime_type=guessed_type)

        prompt = f"""Generate a video from this image that shows: {scene_story}.
                    The video should be smooth and cinematic, lasting 3-5 seconds."""

        operation = client.models.generate_videos(
            model=VEO_MODEL,
            prompt=prompt,
            image=veo_image,
        )

        print("...VEO가 비디오를 생성하는 동안 대기합니다 (약 1~2분 소요)...")
        # Poll the long-running operation every 10 seconds until it reports done.
        while True:
            if operation.done:
                break
            time.sleep(10)
            operation = client.operations.get(operation)

        if operation.error:
            error_message = getattr(operation.error, 'message', str(operation.error))
            raise Exception(f"비디오 생성 실패: {error_message}")

        clip_bytes = operation.response.generated_videos[0].video.video_bytes
        video_path = f"{OUTPUT_DIR}/scene_{scene_idx}_video.mp4"
        with open(video_path, 'wb') as target:
            target.write(clip_bytes)

        print(f"✅ 장면 {scene_idx + 1} 비디오 저장 완료: '{video_path}'")
        return video_path

    except Exception as e:
        print(f"❌ 장면 {scene_idx + 1} 비디오 생성 중 오류 발생: {e}")
        # An empty path signals failure to the caller.
        return ""


def generate_tts_audio(
    client: texttospeech.TextToSpeechClient,
    script: str,
    scene_idx: int,
    lang_code: str = "ko-KR"
) -> str:
    """Step 4: synthesize the voice-over of one scene as an MP3 file.

    Args:
        client: Text-to-Speech client used for the synthesis request.
        script: Narration text to speak.
        scene_idx: Zero-based scene index; used in log lines and the file name.
        lang_code: Language code of the voice; the voice name itself comes
            from ``TTS_VOICE_NAME``.

    Returns:
        Path of the MP3 file written under ``OUTPUT_DIR``.
    """
    print(f"🗣️  4단계: 장면 {scene_idx + 1} TTS 오디오 생성 중...")

    tts_response = client.synthesize_speech(
        input=texttospeech.SynthesisInput(text=script),
        voice=texttospeech.VoiceSelectionParams(language_code=lang_code, name=TTS_VOICE_NAME),
        audio_config=texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3),
    )

    audio_path = f"{OUTPUT_DIR}/scene_{scene_idx}_audio.mp3"
    with open(audio_path, "wb") as audio_file:
        audio_file.write(tts_response.audio_content)

    print(f"✅ 장면 {scene_idx + 1} TTS 생성 완료: {audio_path}")
    return audio_path

def generate_background_music(music_description: str, duration: int) -> str:
    """Step 5: generate background music through the Lyria prediction endpoint.

    Args:
        music_description: Prompt describing the desired music style.
        duration: Requested length in seconds; sent as ``duration_seconds``.
            NOTE(review): confirm that the Lyria endpoint honours this
            parameter.

    Returns:
        Path of the audio file written under ``OUTPUT_DIR``, or an empty
        string when generation fails. Failures are printed and swallowed on
        purpose so the pipeline can continue without background music.
    """
    print(f"🎵 5단계: 배경음악 생성 중... (설명: '{music_description}')")

    def send_request_with_retry(data=None, max_retries=3):
        """POST ``data`` to the Lyria endpoint, retrying on HTTP errors.

        The access token is refreshed before every attempt. The last
        ``HTTPError`` is re-raised once the attempts are exhausted.
        """
        creds, _ = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        headers = {"Content-Type": "application/json"}

        for attempt in range(max_retries):
            try:
                creds.refresh(auth_req)
                headers["Authorization"] = f"Bearer {creds.token}"
                # NOTE(review): no timeout is set on this request; consider
                # adding one so a stalled connection cannot block forever.
                response = requests.post(LYRIA_MODEL_ENDPOINT, headers=headers, json=data)
                response.raise_for_status()
                return response.json()
            except HTTPError as e:
                print(f"❌ HTTP 에러 (시도 {attempt + 1}/{max_retries}): {e}")
                # A Response is falsy for 4xx/5xx status codes, so a plain
                # truthiness test would never show the server's error body.
                if e.response is not None:
                    print(f"  - 서버 응답: {e.response.text}")
                if attempt + 1 < max_retries:
                    time.sleep(5)
                else:
                    raise
        # Only reached when max_retries is zero or negative.
        raise Exception("API 요청 최종 실패")

    request_data = {
      "instances": [{"prompt": music_description, "negative_prompt": "dark"}],
      "parameters": {"duration_seconds": int(duration), "sample_count": 1}
    }

    try:
        predictions = send_request_with_retry(request_data)["predictions"]
        b64_audio_data = predictions[0]['bytesBase64Encoded']
        # NOTE(review): the decoded bytes are stored unchanged under an .mp3
        # name; confirm the actual audio container returned by the API.
        music_path = f"{OUTPUT_DIR}/background_music.mp3"
        with open(music_path, "wb") as f:
            f.write(base64.b64decode(b64_audio_data))
        print(f"✅ 배경음악 생성 완료: {music_path}")
        return music_path
    except Exception as e:
        print(f"🚨 배경음악 생성 실패: {e}. 배경음악 없이 진행합니다.")
        return ""

def extract_last_frame(video_path: str) -> Optional[str]:
    """Save the final frame of a clip as a PNG for the next scene to continue from.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the PNG written under ``OUTPUT_DIR``, or ``None`` when the
        video file does not exist or the frame cannot be extracted.
    """
    if not os.path.exists(video_path):
        print(f"⚠️ 마지막 프레임 추출 건너뛰기: '{video_path}' 파일 없음.")
        return None

    try:
        with VideoFileClip(video_path) as clip:
            # Sample one frame interval before the reported end of the clip.
            seconds_per_frame = 1 / clip.fps
            frame_pixels = clip.get_frame(clip.duration - seconds_per_frame)

            snapshot = Image.fromarray(frame_pixels)
            # The timestamp suffix keeps frames of different scenes apart.
            snapshot_path = f"{OUTPUT_DIR}/last_frame_{int(time.time())}.png"
            snapshot.save(snapshot_path)

            print(f"✅ 마지막 프레임 추출 완료: {snapshot_path}")
            return snapshot_path
    except Exception as e:
        print(f"❌ 마지막 프레임 추출 실패: {e}")
        return None

def combine_video_clips(
    video_paths: list[str],
    audio_paths: list[str],
    background_music_path: str
) -> str:
    """Step 6: merge all scene videos, narration tracks and background music.

    Each video is paired with the narration at the same position. A video
    shorter than its narration is slowed down to the narration's length; a
    longer one is trimmed to it. Missing or tiny (< 1 KiB) video files and
    clips that fail to load are skipped. If a background-music file exists
    it is mixed in at 30% volume and looped to the final duration.

    Args:
        video_paths: Paths of the per-scene video files.
        audio_paths: Paths of the per-scene narration files, in the same order.
        background_music_path: Path of the music file; empty to skip music.

    Returns:
        Path of the rendered video under ``OUTPUT_DIR``, or an empty string
        when no usable clip was found.
    """
    print("🖇️  6단계: 최종 영상 병합 시작...")

    def sanitize_video_clip(video):
        """Trim the clip to the last frame that can actually be decoded."""
        try:
            if not all([hasattr(video, 'fps'), hasattr(video, 'duration'), video.fps, video.duration]):
                return video
            frame_time = 1 / video.fps
            real_duration = video.duration
            # Walk backwards from the nominal end until a frame decodes.
            for frame_number in reversed(range(int(video.fps * video.duration))):
                current_time = frame_number * frame_time
                try:
                    video.get_frame(current_time)
                except Exception:
                    # Unreadable tail frame: try the one before it.
                    # (Not a bare except, so Ctrl-C still interrupts.)
                    continue
                real_duration = current_time + frame_time
                break
            if abs(real_duration - video.duration) > frame_time:
                print(f"✂️ 클립 안정화: {video.duration:.2f}s -> {real_duration:.2f}s")
                return video.subclip(0, real_duration)
            return video
        except Exception as e:
            print(f"🚨 클립 안정화 중 오류: {e}. 원본 클립 사용.")
            return video

    # Every file-backed clip opened below, so its reader can be released once
    # rendering has finished (or failed).
    opened_clips = []
    try:
        video_clips = []
        for video_path, audio_path in zip(video_paths, audio_paths):
            try:
                if not os.path.exists(video_path) or os.path.getsize(video_path) < 1024:
                    print(f"⚠️ 파일이 없거나 손상되어 건너뜁니다: {video_path}")
                    continue

                source_clip = VideoFileClip(video_path)
                opened_clips.append(source_clip)
                video_clip = sanitize_video_clip(source_clip)
                audio_clip = AudioFileClip(audio_path)
                opened_clips.append(audio_clip)

                if video_clip.duration < audio_clip.duration:
                    # Slow the video down so it lasts as long as the narration.
                    speed_factor = video_clip.duration / audio_clip.duration
                    video_clip = vfx.speedx(video_clip, factor=speed_factor).set_duration(audio_clip.duration)
                else:
                    video_clip = video_clip.subclip(0, audio_clip.duration)

                video_clips.append(video_clip.set_audio(audio_clip))

            except Exception as e:
                print(f"🚨 파일 처리 중 오류 발생. 해당 클립을 건너뜁니다: {video_path} ({e})")
                continue

        if not video_clips:
            print("🚨 처리할 유효한 클립이 없어 영상 생성을 중단합니다.")
            return ""

        final_video = concatenate_videoclips(video_clips)

        if background_music_path and os.path.exists(background_music_path):
            music_source = AudioFileClip(background_music_path)
            opened_clips.append(music_source)
            bg_music_original = music_source.volumex(0.3)
            bg_music = afx.audio_loop(bg_music_original, duration=final_video.duration)
            final_video = final_video.set_audio(CompositeAudioClip([final_video.audio, bg_music]))

        output_path = f"{OUTPUT_DIR}/final_video_with_continuity.mp4"
        final_video.write_videofile(
            output_path, codec='libx264', audio_codec='aac', threads=4, preset='medium'
        )

        print(f"✅ 최종 영상 생성 완료: {output_path}")
        return output_path
    finally:
        for clip in opened_clips:
            try:
                clip.close()
            except Exception as e:
                # Cleanup is best-effort and must not mask the real outcome.
                print(f"⚠️ 클립 리소스 해제 중 오류: {e}")


# ==============================================================================
# 5. 전체 파이프라인 실행 함수 (MAIN ORCHESTRATOR)
# ==============================================================================

def run_full_pipeline_with_continuity(
    g_client: genai.Client,
    t_client: texttospeech.TextToSpeechClient,
    video_description: str,
    lang_code: str = "ko-KR"
) -> str:
    """Run the whole multimodal pipeline, chaining scenes for visual continuity.

    For each storyline scene an image, a video and a narration track are
    generated in that order. The last frame of each video is handed to the
    next scene's image generation. Background music is generated afterwards
    and everything is merged into a single video.

    Args:
        g_client: GenAI client for storyline, image and video generation.
        t_client: Text-to-Speech client for the narration.
        video_description: Free-form description of the video to produce.
        lang_code: Language code passed to the narration step.

    Returns:
        Whatever ``combine_video_clips`` returns: the final video path, or an
        empty string when no video could be assembled.
    """
    print("\n🚀 연속성 있는 영상 생성 파이프라인 시작...")
    started_at = time.time()

    storyline = generate_storyline(g_client, video_description)
    scenes = storyline['scenes']
    scene_count = len(scenes)

    video_paths, audio_paths = [], []
    # Image the next scene should continue from; None for the first scene.
    prior_image_path = None

    for idx, scene in enumerate(scenes):
        print(f"\n--- SCENE {idx + 1}/{scene_count} 처리 시작 ---")

        story_text = scene['story']
        image_path = generate_image_with_prior_context(g_client, story_text, prior_image_path, idx)
        video_path = generate_video_from_image(g_client, image_path, story_text, idx)
        audio_path = generate_tts_audio(t_client, scene['script'], idx, lang_code)

        video_paths.append(video_path)
        audio_paths.append(audio_path)

        prior_image_path = extract_last_frame(video_path)

    # A 30-second track is requested for the background music.
    background_music_path = generate_background_music(storyline['music'], 30)

    final_video_path = combine_video_clips(video_paths, audio_paths, background_music_path)

    elapsed_time = time.time() - started_at
    print(f"\n🎉 파이프라인 완료! 총 소요시간: {elapsed_time:.2f}초")
    print(f"📁 최종 영상: {final_video_path}")

    return final_video_path

API 클라이언트를 초기화합니다...
✅ API 클라이언트 초기화 완료.


## Test!

In [11]:
# --- Describe the video you want to create here --- (example: the solar system)
video_description_prompt = """
Create a short, epic video journey through our solar system.
The video should travel outwards from the Sun, passing by Mercury, Venus, Earth, Mars, and then flying through the asteroid belt towards Jupiter.
Make it awe-inspiring and suitable for a documentary opening.
Each scene should transition smoothly to the next.
"""

# Run the pipeline, handing over the client objects created at module level.
final_video = run_full_pipeline_with_continuity(
    genai_client,
    tts_client,
    video_description_prompt,
)

# An empty path means the pipeline could not assemble a final video.
print(
    f"\n✅ 최종 비디오가 성공적으로 생성되었습니다: {final_video}"
    if final_video
    else "\n🚨 최종 비디오 생성에 실패했습니다."
)


🚀 연속성 있는 영상 생성 파이프라인 시작...
🎬 1단계: 스토리라인 생성 중...
✅ 스토리라인 생성 완료: 7개 장면

--- SCENE 1/7 처리 시작 ---
🖼️  2단계: 장면 1 이미지 생성 중...
✅ 장면 1 이미지 생성 완료: veo_story_telling/scene_0_image.png
📹 3단계: 장면 1 비디오 생성 중...
...VEO가 비디오를 생성하는 동안 대기합니다 (약 1~2분 소요)...
✅ 장면 1 비디오 저장 완료: 'veo_story_telling/scene_0_video.mp4'
🗣️  4단계: 장면 1 TTS 오디오 생성 중...
✅ 장면 1 TTS 생성 완료: veo_story_telling/scene_0_audio.mp3
✅ 마지막 프레임 추출 완료: veo_story_telling/last_frame_1756426293.png

--- SCENE 2/7 처리 시작 ---
🖼️  2단계: 장면 2 이미지 생성 중...
✅ 장면 2 이미지 생성 완료: veo_story_telling/scene_1_image.png
📹 3단계: 장면 2 비디오 생성 중...
...VEO가 비디오를 생성하는 동안 대기합니다 (약 1~2분 소요)...
✅ 장면 2 비디오 저장 완료: 'veo_story_telling/scene_1_video.mp4'
🗣️  4단계: 장면 2 TTS 오디오 생성 중...
✅ 장면 2 TTS 생성 완료: veo_story_telling/scene_1_audio.mp3
✅ 마지막 프레임 추출 완료: veo_story_telling/last_frame_1756426385.png

--- SCENE 3/7 처리 시작 ---
🖼️  2단계: 장면 3 이미지 생성 중...
✅ 장면 3 이미지 생성 완료: veo_story_telling/scene_2_image.png
📹 3단계: 장면 3 비디오 생성 중...
...VEO가 비디오를 생성하는 동안 대기합니다 (약 1~2분 소요)...
✅ 장면 3 비디오 저장 완

                                                                     

MoviePy - Done.
Moviepy - Writing video veo_story_telling/final_video_with_continuity.mp4



                                                                

Moviepy - Done !
Moviepy - video ready veo_story_telling/final_video_with_continuity.mp4
✅ 최종 영상 생성 완료: veo_story_telling/final_video_with_continuity.mp4

🎉 파이프라인 완료! 총 소요시간: 716.54초
📁 최종 영상: veo_story_telling/final_video_with_continuity.mp4

✅ 최종 비디오가 성공적으로 생성되었습니다: veo_story_telling/final_video_with_continuity.mp4


In [None]:
# # --- 여기에 만들고 싶은 영상에 대한 설명을 입력하세요 ---
# video_description_prompt = """
# Create a short, dynamic time-lapse video of a skyscraper being built.
# The video should show the progression from an empty construction site with foundations being laid, to the steel framework rising floor by floor, then the glass exterior being installed, and finally the completed tower standing tall in the city skyline.
# Make it feel powerful and modern.
# Each scene should transition smoothly to the next.
# """

# # 파이프라인 실행
# # 전역으로 생성된 클라이언트 객체를 전달합니다.
# final_video = run_full_pipeline_with_continuity(
#     g_client=genai_client,
#     t_client=tts_client,
#     video_description=video_description_prompt
# )

# if final_video:
#     print(f"\n✅ 최종 비디오가 성공적으로 생성되었습니다: {final_video}")
# else:
#     print("\n🚨 최종 비디오 생성에 실패했습니다.")

In [None]:
# # --- 여기에 만들고 싶은 영상에 대한 설명을 입력하세요 --- ## 책만들기
# video_description_prompt = """
# Create a short educational video on how a book is made.
# The video should flow from a large roll of paper being fed into a printing press, to the printed pages being cut and stacked, then the cover being attached, and finally the finished books being boxed for shipping.
# Make it informative and fascinating.
# Each scene should transition smoothly to the next.
# """

# # 파이프라인 실행
# # 전역으로 생성된 클라이언트 객체를 전달합니다.
# final_video = run_full_pipeline_with_continuity(
#     g_client=genai_client,
#     t_client=tts_client,
#     video_description=video_description_prompt
# )

# if final_video:
#     print(f"\n✅ 최종 비디오가 성공적으로 생성되었습니다: {final_video}")
# else:
#     print("\n🚨 최종 비디오 생성에 실패했습니다.")