## CE Enablement Session - Practice 03
- ## Workflow: Story telling: Gemini --> VEO

### 1. (권장) 가상환경 생성 및 활성화
python -m venv venv
#### macOS/Linux:
source venv/bin/activate

### 2. 필수 파이썬 라이브러리 설치
pip install --upgrade google-cloud-aiplatform google-genai google-cloud-texttospeech "moviepy<2" Pillow requests ipython

In [1]:
# ==============================================================================
# 1. 패키지 임포트 및 전역 설정
# ==============================================================================
import base64
import io
import json
import mimetypes
import os
import time
from typing import Any, Dict, List, Optional

# Third-party libraries
import requests
from PIL import Image as PIL_Image  # Aliased as 'PIL_Image' to avoid a clash with the name 'Image'
from moviepy.editor import (
    VideoFileClip, AudioFileClip, CompositeVideoClip,
    concatenate_videoclips, ImageClip, CompositeAudioClip
)
from moviepy.video.fx.all import speedx
from moviepy.audio.fx.all import audio_loop

# Google Cloud and Generative AI libraries
import google.auth
import google.auth.transport.requests
from google.cloud import texttospeech
from google import genai
from google.genai import types
from google.genai.types import (
    EditImageConfig,
    GenerateImagesConfig,
    Image,
    MaskReferenceConfig,
    MaskReferenceImage,
    RawReferenceImage,
)
from IPython.display import Audio  # Audio playback in Jupyter/Colab environments

# --- Global constants ---
# Google Cloud project used by the GenAI client and embedded in the Lyria URL.
PROJECT_ID = "jc-gcp-project"
# Region taken from the GOOGLE_CLOUD_REGION environment variable, defaulting
# to "us-central1".
# NOTE(review): no code visible in this file reads LOCATION; the Lyria URL
# hard-codes "us-central1" and the GenAI client uses location="global".
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
# Directory that receives every generated video, audio and music file.
OUTPUT_DIR = "veo_story_telling"
# Vertex AI ":predict" endpoint of the lyria-002 music model for PROJECT_ID.
LYRIA_ENDPOINT_URL = (
    f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}"
    "/locations/us-central1/publishers/google/models/lyria-002:predict"
)

# --- Global variables and initialization ---
# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

try:
    # Initialize the API client once at startup so every function can reuse it.
    CLIENT = genai.Client(vertexai=True, project=PROJECT_ID, location="global")
    print("✅ GenAI 클라이언트가 성공적으로 초기화되었습니다.")
except Exception as e:
    # Initialization failure is reported but not fatal here: CLIENT is set to
    # None and the generation functions check for that before using it.
    print(f"❌ GenAI 클라이언트 초기화 실패: {e}")
    CLIENT = None

✅ GenAI 클라이언트가 성공적으로 초기화되었습니다.


In [2]:
# ==============================================================================
# 2. 헬퍼 함수 (Helper Functions)
# ==============================================================================

def _send_lyria_request(data=None, timeout: float = 300.0) -> dict:
    """Send one prediction request to the Lyria (music generation) endpoint.

    NOTE: a second function with the same name (the variant with retry logic)
    is defined further down in this file. Once that definition has executed it
    replaces this one, so this version is only active until then.

    Args:
        data: JSON-serializable request body posted to LYRIA_ENDPOINT_URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
    """
    # Application Default Credentials; refresh to obtain a current access token.
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    headers = {
        "Authorization": f"Bearer {creds.token}",
        "Content-Type": "application/json",
    }
    response = requests.post(
        LYRIA_ENDPOINT_URL, headers=headers, json=data, timeout=timeout
    )
    response.raise_for_status()
    return response.json()

def _poll_and_save_video(operation: Any, scene_idx: int) -> str:
    """Wait for a VEO video operation to finish and save the result to disk.

    Args:
        operation: The long-running operation object returned by
            ``CLIENT.models.generate_videos``.
        scene_idx: Zero-based scene index, used in the output file name and in
            error messages.

    Returns:
        Path of the written MP4 file (``{OUTPUT_DIR}/scene_{scene_idx}_video.mp4``).

    Raises:
        RuntimeError: If the operation reports an error, finishes without any
            generated video, returns no inline video bytes, or the file cannot
            be written.
    """
    # Poll every 10 seconds until the service marks the operation as done.
    while not operation.done:
        print("Waiting for video generation/animation to complete...")
        time.sleep(10)
        operation = CLIENT.operations.get(operation)

    error = getattr(operation, "error", None)
    # A finished operation may carry no response at all, so every level is
    # read defensively instead of assuming the attribute chain exists.
    response = getattr(operation, "response", None)
    generated_videos = getattr(response, "generated_videos", None)

    if error or not generated_videos:
        error_message = f"Video generation failed for scene {scene_idx}."
        if error:
            error_message += f" Reason: {error}"
        else:
            error_message += " Reason: The operation may have been blocked by safety filters or returned no content."
        raise RuntimeError(error_message)

    video_bytes = generated_videos[0].video.video_bytes
    if not video_bytes:
        raise RuntimeError(
            f"Video generation for scene {scene_idx} returned no inline video bytes."
        )

    video_path = f"{OUTPUT_DIR}/scene_{scene_idx}_video.mp4"
    try:
        with open(video_path, 'wb') as f:
            f.write(video_bytes)
    except OSError as e:
        # Chain the original error so the underlying cause stays visible.
        raise RuntimeError(f"Error saving video file for scene {scene_idx}: {e}") from e

    print(f"✅ Video saved successfully to '{video_path}'")
    return video_path

def _send_lyria_request(data=None, max_retries=3, timeout: float = 300.0) -> dict:
    """Send a request to the Lyria (music generation) API, retrying on failure.

    Args:
        data: JSON-serializable request body posted to LYRIA_ENDPOINT_URL.
        max_retries: Maximum number of attempts before the last error is
            re-raised.
        timeout: Seconds to wait for each response. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON response body of the first successful attempt.

    Raises:
        requests.exceptions.RequestException: The error of the final attempt
            (HTTP error status, connection failure or timeout).
        Exception: If ``max_retries`` is not positive, so no attempt was made.
    """
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    headers = {"Content-Type": "application/json"}

    for attempt in range(max_retries):
        try:
            # Refresh on every attempt so a retry never reuses an expired token.
            creds.refresh(auth_req)
            headers["Authorization"] = f"Bearer {creds.token}"

            response = requests.post(
                LYRIA_ENDPOINT_URL, headers=headers, json=data, timeout=timeout
            )
            response.raise_for_status()
            return response.json()

        # RequestException also covers connection errors and timeouts, which
        # are as worth retrying as HTTP error statuses.
        except requests.exceptions.RequestException as e:
            print(f"❌ Lyria API 요청 실패 (시도 {attempt + 1}/{max_retries}): {e}")
            # Printing the server's own error message helps a lot with debugging.
            server_response = getattr(e, "response", None)
            if server_response is not None:
                print(f"    - 서버 응답: {server_response.text}")

            if attempt + 1 == max_retries:
                print("🚨 최종 재시도에 실패했습니다.")
                raise

            print("    - 5초 후 재시도합니다...")
            time.sleep(5)

    # Only reachable when the loop body never ran (max_retries <= 0).
    raise Exception("API 요청이 모든 재시도 끝에 실패했습니다.")

In [3]:
# ==============================================================================
# 3. 핵심 생성 함수 (Core Generative Functions)
# ==============================================================================

def generate_storyline(video_description: str, min_scenes: int = 5, initial_image_path: Optional[str] = None) -> Dict:
    """Generate a structured storyline with Gemini (pipeline step 1).

    Args:
        video_description: The user's description of the desired video. It is
            embedded verbatim in the prompt.
        min_scenes: Minimum number of scenes the model is asked to produce.
        initial_image_path: Optional path to a reference image. When the file
            exists it is sent together with the prompt.

    Returns:
        The parsed JSON storyline. The response schema requests the keys
        "description", "music" and "scenes", where each scene has "story" and
        "script".

    Raises:
        Exception: If the global GenAI client is not initialized.
        ValueError: If the model returns no text, invalid JSON, or a storyline
            without scenes.
    """
    print("--- 1단계: 스토리라인 생성 중... ---")
    if not CLIENT:
        raise Exception("GenAI Client is not initialized.")

    model = "gemini-2.5-pro"

    # The user description and the minimum scene count are interpolated
    # directly into the prompt.
    prompt = f"""You are a highly skilled creative content planner. Your primary goal is to fulfill the user's request for a video storyline based on their exact description.

    [USER'S VIDEO DESCRIPTION]
    {video_description}

    Create a compelling storyline for a short, cinematic video based ONLY on the user's description above.
    Do not invent a new story that deviates from the user's explicit request.

    If an initial image is provided, analyze its mood and style, and integrate them seamlessly into the tone. However, the core narrative MUST come from the [USER'S VIDEO DESCRIPTION].

    Your storyline must:
    - Directly reflect the [USER'S VIDEO DESCRIPTION].
    - Contain at least {min_scenes} scenes.
    - Be cinematic, visually engaging, and suitable for a professional video.
    - Suggest a music style that complements the story.
    - The script should provide appropriate voiceover or dialogue for each scene.
    - **Absolutely avoid generating any content that is political, violent, sexually explicit, promotes hate speech, depicts self-harm, involves children in inappropriate contexts, or is otherwise harmful or unethical.**
    """

    # The text prompt always comes first; the image part is optional.
    parts = [types.Part.from_text(text=prompt)]
    if initial_image_path and os.path.exists(initial_image_path):
        print("--- 초기 이미지를 분석하여 스토리라인을 생성합니다 ---")
        with open(initial_image_path, "rb") as f:
            image_bytes = f.read()
        # Use the file's real MIME type (e.g. JPEG); fall back to PNG when it
        # cannot be guessed from the file name.
        image_mime_type = mimetypes.guess_type(initial_image_path)[0] or "image/png"
        parts.append(types.Part.from_bytes(data=image_bytes, mime_type=image_mime_type))
    else:
        print("--- 텍스트 설명으로 스토리라인을 생성합니다 ---")

    contents = [types.Content(role="user", parts=parts)]

    # Structured-output schema: one description, one music style, a scene list.
    storyline_schema = {
        "type": "OBJECT",
        "properties": {
            "description": {"type": "STRING"},
            "music": {"type": "STRING"},
            "scenes": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "story": {"type": "STRING"},
                        "script": {"type": "STRING"},
                    },
                },
            },
        },
    }
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        response_mime_type="application/json",
        response_schema=storyline_schema,
        system_instruction=[types.Part.from_text(text="Script must be in KOREAN. Music must be in English")],
    )

    response = CLIENT.models.generate_content(model=model, contents=contents, config=generate_content_config)

    if not response.text:
        raise ValueError("Storyline generation returned no text (the response may have been blocked).")

    # Strip a Markdown code fence in case the model wrapped the JSON in one.
    clean_text = response.text.strip().removeprefix("```json").removesuffix("```").strip()
    storyline = json.loads(clean_text)

    if not storyline.get("scenes"):
        raise ValueError("Storyline generation returned no scenes.")

    print(f"✅ 스토리라인 생성 완료: {len(storyline['scenes'])}개 장면")
    print(storyline)
    print()
    return storyline

def generate_character_sheet(story_description: str) -> str:
    """Create a detailed visual description (character sheet) of the protagonist.

    Args:
        story_description: Story summary that is embedded in the prompt.

    Returns:
        The model's answer with surrounding whitespace removed.

    Raises:
        Exception: If the global GenAI client is not initialized.
    """
    print("--- 📝 주인공 캐릭터 시트 생성 중... ---")
    if not CLIENT:
        raise Exception("GenAI Client is not initialized.")

    # The story summary is interpolated straight into the instruction text.
    sheet_request = f"""
    Based on the following story summary, create a highly detailed visual description for the main character(s).
    This description will be used as a consistent reference for an animation AI.
    Describe their physical appearance, clothing, color palette, key features, and overall style.
    The description must be detailed enough for any artist to replicate their appearance precisely.
    The output should be a single paragraph of text.

    [STORY SUMMARY]
    {story_description}
    """

    answer = CLIENT.models.generate_content(
        model="gemini-2.5-flash",
        contents=sheet_request,
    )
    sheet_text = answer.text.strip()
    print(f"✅ 캐릭터 시트 생성 완료:\n{sheet_text}\n")
    return sheet_text

def create_detailed_veo_prompt(character_sheet: str, scene_story: str, animation_style: str) -> str:
    """Merge character sheet, scene description and style into one VEO prompt.

    Args:
        character_sheet: Visual description of the character(s) to keep
            consistent across scenes.
        scene_story: Description of the action in the current scene.
        animation_style: Animation style the video must follow.

    Returns:
        The model-written prompt paragraph with surrounding whitespace removed.

    Raises:
        Exception: If the global GenAI client is not initialized.
    """
    print("--- 🤖 VEO를 위한 상세 프롬프트 생성 중... ---")
    if not CLIENT:
        raise Exception("GenAI Client is not initialized.")

    # All three inputs are interpolated directly into the director instructions.
    director_request = f"""
    You are an expert animation director. Your task is to combine the following elements into a single, highly detailed, and coherent prompt paragraph for a text-to-video AI (VEO).

    1.  **Animation Style (Must adhere to this):** {animation_style}

    2.  **Consistent Character Description (Crucial to follow precisely for every detail):** {character_sheet}

    3.  **Current Scene Description (The action to portray):** {scene_story}

    Combine these into a rich, descriptive paragraph. Explicitly describe the character's appearance based on the character sheet, their actions from the scene story, the camera angle (e.g., close-up, wide shot, point-of-view), the lighting (e.g., dramatic, soft morning light), and the overall mood. The final output must be a single paragraph prompt ready for the video model.
    """
    answer = CLIENT.models.generate_content(
        model="gemini-2.5-flash",
        contents=director_request,
    )
    veo_prompt = answer.text.strip()
    print(f"✅ 생성된 상세 프롬프트:\n{veo_prompt}\n")
    return veo_prompt

def generate_video_from_text(detailed_prompt: str, scene_idx: int) -> str:
    """Generate a VEO video from an already prepared detailed prompt.

    Args:
        detailed_prompt: Prompt text passed to the video model.
        scene_idx: Zero-based scene index, forwarded to the save helper.

    Returns:
        Whatever ``_poll_and_save_video`` returns for the started operation.

    Raises:
        Exception: If the global GenAI client is not initialized.
    """
    print(f"--- 상세 프롬프트로 Scene {scene_idx+1} 영상 생성 중 ---")
    if not CLIENT:
        raise Exception("GenAI Client is not initialized.")

    veo_model = "veo-3.0-generate-preview"
    pending_operation = CLIENT.models.generate_videos(
        model=veo_model,
        prompt=detailed_prompt,
    )
    saved_path = _poll_and_save_video(pending_operation, scene_idx)
    return saved_path

def generate_video_from_image(image_path: str, scene_story: str, scene_idx: int) -> str:
    """Animate the provided image into a video with VEO.

    Args:
        image_path: Path of the image file to animate.
        scene_story: Description of the scene, embedded in the prompt.
        scene_idx: Zero-based scene index, forwarded to the save helper.

    Returns:
        Whatever ``_poll_and_save_video`` returns for the started operation.

    Raises:
        Exception: If the global GenAI client is not initialized, or if the
            image cannot be read or its MIME type cannot be determined.
    """
    print(f"--- Scene {scene_idx+1} 초기 이미지 애니메이션화 중 ---")
    if not CLIENT:
        raise Exception("GenAI Client is not initialized.")

    try:
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type:
            raise ValueError(f"Could not determine MIME type for {image_path}")
        with open(image_path, "rb") as f:
            image_byte_data = f.read()
        # generate_videos takes its reference image as a types.Image, not as a
        # content Part.
        source_image = types.Image(image_bytes=image_byte_data, mime_type=mime_type)
        print("✅ 애니메이션용 이미지 객체 생성 완료!")
    except Exception as e:
        # Chain the original error so the underlying cause stays visible.
        raise Exception(f"이미지 준비 중 오류 발생: {e}") from e

    # Only the instruction and the scene text go to the model; the leftover
    # editing note that used to trail the prompt has been removed.
    prompt = f"Animate this image. The scene is about: '{scene_story}'"
    operation = CLIENT.models.generate_videos(model="veo-3.0-generate-preview", prompt=prompt, image=source_image)
    return _poll_and_save_video(operation, scene_idx)

def generate_tts_audio(script: str, scene_idx: int, lang_code: str = "ko-KR") -> str:
    """Synthesize the scene script to an MP3 file with Cloud Text-to-Speech.

    Args:
        script: Text to be spoken.
        scene_idx: Zero-based scene index, used in the output file name.
        lang_code: Language/locale code of the voice, e.g. "ko-KR".

    Returns:
        Path of the written file (``{OUTPUT_DIR}/scene_{scene_idx}_audio.mp3``).

    Raises:
        ValueError: If ``script`` is empty or contains only whitespace.
    """
    if not script or not script.strip():
        raise ValueError(f"Scene {scene_idx} has an empty script; nothing to synthesize.")

    client = texttospeech.TextToSpeechClient()
    synthesis_input = texttospeech.SynthesisInput(text=script)
    # Build the voice name from the requested locale so language_code and
    # voice always match; the default "ko-KR" gives the same voice as before.
    voice_name = f"{lang_code}-Chirp3-HD-Callirrhoe"
    voice = texttospeech.VoiceSelectionParams(language_code=lang_code, name=voice_name)
    audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
    audio_path = f"{OUTPUT_DIR}/scene_{scene_idx}_audio.mp3"
    with open(audio_path, "wb") as out:
        out.write(response.audio_content)
    print(f"✅ 장면 {scene_idx+1} TTS 생성 완료: {audio_path}")
    return audio_path

def generate_background_music(music_description: str, duration: float) -> str:
    """Generate background music through the Lyria API and save it to disk.

    Args:
        music_description: Description of the desired music style, embedded in
            the prompt.
        duration: Requested length in seconds; truncated to an integer for the
            request.

    Returns:
        Path of the written file (``{OUTPUT_DIR}/background_music.mp3``).

    Raises:
        RuntimeError: If the response contains no prediction with audio data.
    """
    print("--- 🎼 배경음악 생성 중... ---")

    # The request structure follows a known-working example:
    # duration_seconds is placed inside the 'instances' entry.
    music_request = {
        "prompt": f"Generate cinematic film score music: {music_description}",
        "negative_prompt": "dark",
        "sample_count": 1,
        "duration_seconds": int(duration)
    }
    request_data = {"instances": [music_request], "parameters": {}}

    print(f"요청 데이터: {request_data}")

    # Call the request helper, which contains the retry logic.
    response_json = _send_lyria_request(request_data)

    # Fail with a clear message instead of a bare KeyError/IndexError when the
    # service answers without any audio payload.
    predictions = response_json.get("predictions") or []
    if not predictions or not predictions[0].get("bytesBase64Encoded"):
        raise RuntimeError("Lyria API response did not contain any generated audio.")

    b64_audio_data = predictions[0]['bytesBase64Encoded']
    decoded_audio_data = base64.b64decode(b64_audio_data)
    # NOTE(review): the bytes are written as received; whether they are really
    # MP3-encoded (as the extension suggests) is not verified here — confirm
    # against the Lyria API documentation.
    music_path = f"{OUTPUT_DIR}/background_music.mp3"
    with open(music_path, "wb") as out:
        out.write(decoded_audio_data)
    print(f"✅ 배경음악 생성 완료: {music_path}")
    return music_path

def combine_video_clips_with_smooth_transitions(video_paths: List[str], audio_paths: List[str], background_music_path: str) -> str:
    """Merge the scene videos, narration audio and background music into one file.

    Each scene video is matched to the length of its narration: a video that
    is shorter than the audio is slowed down, otherwise it is cut to the audio
    length. The scenes are then concatenated and, when the background music
    file exists, the music is mixed in at 30% volume (looped or trimmed to the
    final length).

    Args:
        video_paths: Paths of the scene videos, in playback order.
        audio_paths: Paths of the narration files, one per scene video.
        background_music_path: Path of the background music file; skipped when
            empty or when the file does not exist.

    Returns:
        Path of the written file
        (``{OUTPUT_DIR}/final_video_with_continuity.mp4``).

    Raises:
        ValueError: If there are no scenes or the two path lists differ in
            length.
    """
    if not video_paths:
        raise ValueError("No video clips to combine.")
    if len(video_paths) != len(audio_paths):
        # zip() would silently drop the surplus scenes.
        raise ValueError(
            f"Got {len(video_paths)} video paths but {len(audio_paths)} audio paths; "
            "each scene needs exactly one of each."
        )

    # Every clip opened from a file holds an external reader, so each is
    # tracked here and closed once rendering has finished (or failed).
    opened_clips = []
    try:
        video_clips = []
        for video_path, audio_path in zip(video_paths, audio_paths):
            video_clip = VideoFileClip(video_path)
            opened_clips.append(video_clip)
            audio_clip = AudioFileClip(audio_path)
            opened_clips.append(audio_clip)

            # Match the video length to the narration length.
            if video_clip.duration < audio_clip.duration:
                # factor < 1 slows the video down so it lasts as long as the audio.
                speed_factor = video_clip.duration / audio_clip.duration
                video_clip = speedx(video_clip, factor=speed_factor).set_duration(audio_clip.duration)
            else:
                video_clip = video_clip.subclip(0, audio_clip.duration)
            video_clips.append(video_clip.set_audio(audio_clip))

        final_video = concatenate_videoclips(video_clips, method="compose")

        if background_music_path and os.path.exists(background_music_path):
            bg_source = AudioFileClip(background_music_path)
            opened_clips.append(bg_source)
            bg_music = bg_source.volumex(0.3)
            if bg_music.duration < final_video.duration:
                bg_music = audio_loop(bg_music, duration=final_video.duration)
            else:
                bg_music = bg_music.subclip(0, final_video.duration)

            final_audio = CompositeAudioClip([final_video.audio, bg_music])
            final_video.audio = final_audio

        output_path = f"{OUTPUT_DIR}/final_video_with_continuity.mp4"
        final_video.write_videofile(output_path, codec='libx264', audio_codec='aac', threads=4, preset='medium', logger=None)
    finally:
        for clip in opened_clips:
            clip.close()

    print(f"✅ 최종 영상 생성 완료: {output_path}")
    return output_path

In [4]:
# ==============================================================================
# 4. 메인 파이프라인 실행 함수
# ==============================================================================

def run_full_pipeline_with_continuity(
    video_description: str,
    initial_image_path: Optional[str] = None,
    lang_code: str = "ko-KR",
    animation_style: str = "A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures."
) -> Optional[str]:
    """Run the whole pipeline: storyline, videos, narration, music, final merge.

    For every scene of the generated storyline one video and one narration
    file are produced. The first scene is animated from ``initial_image_path``
    when that file exists; every other scene is generated from a detailed text
    prompt built from the shared character sheet.

    Args:
        video_description: The user's description of the desired video.
        initial_image_path: Optional image used to animate the first scene.
        lang_code: Language code passed to the narration (TTS) step.
        animation_style: Style description inserted into every video prompt.

    Returns:
        Path of the final merged video, or ``None`` when any step failed (the
        error is printed instead of raised).
    """
    print(f"🚀 Starting pipeline with '{animation_style}' style...")
    start_time = time.time()

    # Length in seconds requested for the background music.
    background_music_seconds = 30

    try:
        storyline = generate_storyline(video_description, initial_image_path=initial_image_path)
        character_sheet = generate_character_sheet(storyline['description'])
        scenes = storyline['scenes']

        # Only the very first scene may be animated from the supplied image.
        animate_first_scene = bool(initial_image_path) and os.path.exists(initial_image_path)

        video_paths = []
        audio_paths = []

        for idx, scene in enumerate(scenes):
            print(f"\n🎬 Processing Scene {idx + 1}/{len(scenes)}...")

            if idx == 0 and animate_first_scene:
                video_path = generate_video_from_image(initial_image_path, scene['story'], idx)
            else:
                detailed_prompt = create_detailed_veo_prompt(
                    character_sheet=character_sheet,
                    scene_story=scene['story'],
                    animation_style=animation_style
                )
                video_path = generate_video_from_text(detailed_prompt, idx)
            video_paths.append(video_path)

            audio_paths.append(generate_tts_audio(scene['script'], idx, lang_code))

        background_music_path = generate_background_music(storyline['music'], background_music_seconds)

        print("\n🎞️ Merging all clips into the final video...")
        final_video_path = combine_video_clips_with_smooth_transitions(video_paths, audio_paths, background_music_path)

        elapsed_time = time.time() - start_time
        print(f"\n🎉 Pipeline complete! Total time: {elapsed_time:.2f} seconds")
        print(f"📁 Final video available at: {final_video_path}")

        return final_video_path

    except Exception as e:
        # Top-level boundary of the notebook pipeline: report and return None
        # so the calling cell does not end in a traceback.
        print(f"\n❌ 파이프라인 실행 중 심각한 오류가 발생하여 중단합니다.")
        print(f"오류 상세 정보: {e}")
        return None

## Test!

In [5]:
# Demo run: text-only description (no initial image) with the default language
# code. final_video receives the pipeline's return value: the output path, or
# None when the pipeline failed.
final_video = run_full_pipeline_with_continuity(
    video_description="한 젊은 개발자가 AI 동반자와 함께 코딩하는 이야기",
    animation_style="A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures."
)

🚀 Starting pipeline with 'A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures.' style...
--- 1단계: 스토리라인 생성 중... ---
--- 텍스트 설명으로 스토리라인을 생성합니다 ---
✅ 스토리라인 생성 완료: 5개 장면
{'description': '한 젊은 개발자가 깊은 밤, 복잡한 코딩 문제에 부딪히지만, 그의 AI 동반자와 협력하여 문제를 해결하고 함께 성장하는 과정을 감성적이고 시네마틱하게 담아낸 영상입니다.', 'music': 'Inspiring and uplifting electronic music with a mellow beat', 'scenes': [{'story': '어두운 방, 모니터 불빛만이 젊은 개발자의 지친 얼굴을 비춥니다. 그는 복잡한 코드 앞에서 좌절하고 있습니다. 화면에는 에러 메시지가 가득합니다.', 'script': '세상은 코드로 이루어져 있고, 나는 그 세상의 건축가다. 하지만 때론, 가장 견고한 설계도 한 줄의 오류 앞에 무너져 내린다.'}, {'story': '개발자가 한숨을 쉬며 AI 동반자 프로그램을 실행합니다. 화면 한쪽에 세련되고 미니멀한 UI가 나타나며, 빛의 파동 같은 형태로 AI가 활성화됩니다.', 'script': '혼자라고 느껴질 때, 나는 나의 가장 특별한 파트너를 부른다.'}, {'story': '개발자와 AI가 함께 코딩하는 장면이 빠르게 교차됩니다. 개발자의 손가락이 키보드 위를 날아다니고, AI는 실시간으로 코드 구조를 시각화하고 최적의 해결책을 제안합니다. 둘의 협업이 마치 한 편의 춤처럼 보입니다.', 'script': '우리의 언어는 논리. 우리의 대화는 데이터. 복잡한 문제 속에서 우리는 함께 길을 찾는다.'}, {'story': "마침내 에러 메시지가 사라지고 '컴파일 성공'이라는 문구가 화면에 뜹니다. 개발자

In [6]:
# run_full_pipeline_with_continuity(
#     video_description="숙제를 못하다가 새로운 노트북을 사고 갑자기 모범생이되는 노트북 광고 만들어줘.",
#     animation_style="A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures."
# )

In [7]:
# run_full_pipeline_with_continuity(
#     video_description="귀여운 제비가 좋은 소식을 가져다 주는 이야기. 우체국 광고에 쓸 수 있는 스토리로 만들어줘.",
#     animation_style="A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures."
# )

In [8]:
# final_video = run_full_pipeline_with_continuity(
#     video_description="예기치 못한 인생의 비바람을 막아주는 가장 튼튼한 울타리를 보여주는 'xx건설' 광고",
#     animation_style="A vibrant and polished 3D animation in the style of Pixar, with expressive characters and detailed textures."
# )