In [None]:
import subprocess

def find_zombie_processes():
    """Find all zombie processes."""
    try:
        result = subprocess.run(["ps", "aux"], capture_output=True, text=True)
        lines = result.stdout.split("\n")

        zombie_pids = []
        for line in lines:
            parts = line.split()
            if len(parts) > 7 and parts[7] == "Z":
                pid = parts[1]  # PID is the second column
                zombie_pids.append(pid)

        return zombie_pids
    except Exception as e:
        print(f"Error finding zombie processes: {e}")
        return []

def kill_processes(pids):
    """Kill processes by PID."""
    for pid in pids:
        try:
            subprocess.run(["sudo", "kill", "-9", pid], check=True)
            print(f"✅ Killed zombie process: {pid}")
        except Exception as e:
            print(f"❌ Failed to kill process {pid}: {e}")

if __name__ == "__main__":
    zombie_pids = find_zombie_processes()
    if zombie_pids:
        print(f"🧟 Found zombie processes: {zombie_pids}")
        kill_processes(zombie_pids)
    else:
        print("🎉 No zombie processes found!")


In [2]:
import subprocess
import re
import os

# Set threshold in MiB (e.g. kill anything using more than 5000MiB)
THRESHOLD_MB = 5000

def get_gpu_processes():
    result = subprocess.run(
        ["nvidia-smi", "--query-compute-apps=pid,used_memory", "--format=csv,noheader,nounits"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    if result.returncode != 0:
        print("❌ Error getting GPU process info")
        print(result.stderr)
        return []

    lines = result.stdout.strip().split('\n')
    processes = []
    for line in lines:
        try:
            pid_str, mem_str = line.strip().split(',')
            pid = int(pid_str)
            mem = int(mem_str)
            processes.append((pid, mem))
        except ValueError:
            continue  # skip malformed lines
    return processes

def kill_heavy_gpu_processes(threshold_mb=THRESHOLD_MB):
    processes = get_gpu_processes()
    killed = []
    for pid, mem in processes:
        if mem > threshold_mb:
            try:
                os.kill(pid, 9)
                print(f"💀 Killed PID {pid} using {mem} MiB")
                killed.append(pid)
            except Exception as e:
                print(f"⚠️ Failed to kill PID {pid}: {e}")
    if not killed:
        print("✅ No processes exceeded the threshold.")
    return killed

# 🧪 Run it
if __name__ == "__main__":
    kill_heavy_gpu_processes()


✅ No processes exceeded the threshold.


In [3]:
# !pip install pydub moviepy librosa
# !conda install -y -c conda-forge ffmpeg
# !pip install gTTS openai
# !pip install opencv-python
# !pip install torch
# !pip install git+https://github.com/openai/whisper.git
# !pip install boto3
# !pip install pathlib


!mkdir -p ~/.fonts
!mkdir -p ~/.local/share/fonts

!cp ../../src/fonts/nicomoji-plus_v2-5.ttf ~/.fonts/
!cp ../../src/fonts/Valty_DEMO.otf ~/.fonts/

!fc-cache -fv
!fc-list | grep "Valty\|nicomoji"



/usr/share/fonts: caching, new cache contents: 0 fonts, 2 dirs
/usr/share/fonts/X11: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/X11/encodings: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/X11/encodings/large: caching, new cache contents: 0 fonts, 0 dirs
/usr/share/fonts/X11/misc: 

caching, new cache contents: 89 fonts, 0 dirs
/usr/share/fonts/X11/util: caching, new cache contents: 0 fonts, 0 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype/dejavu: 

caching, new cache contents: 6 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/home/ubuntu/.local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/home/ubuntu/.fonts: caching, new cache contents: 2 fonts, 0 dirs
/usr/share/fonts/X11: skipping, looped directory detected
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/X11/encodings: skipping, looped directory detected
/usr/share/fonts/X11/misc: skipping, looped directory detected
/usr/share/fonts/X11/util: skipping, looped directory detected
/usr/share/fonts/truetype/dejavu: skipping, looped directory detected
/usr/share/fonts/X11/encodings/large: skipping, looped directory detected
/var/cache/fontconfig: not cleaning unwritable cache directory
/home/ubuntu/.cache/fontconfig: cleaning cache directory
/home/ubuntu/.fontconfig: not cleaning non-existent cache directory


fc-cache: succeeded


/home/ubuntu/.fonts/Valty_DEMO.otf: Valty DEMO:style=Bold Italic
/home/ubuntu/.fonts/nicomoji-plus_v2-5.ttf: NicoMoji+v2,ニコモジ＋v2:style=Regular


In [4]:
import whisper
import torch

torch.cuda.empty_cache()  # Clears unused GPU memory
torch.cuda.memory_summary(device=None, abbreviated=False)  # Show memory usage

# 🎤 Load the "large" Whisper model
model = whisper.load_model("large")

from pathlib import Path

# 🎧 音声ファイルを拡張子順に探索
for ext in [".mp3", ".m4a", ".opus"]:
    candidate = Path(f"music{ext}")
    if candidate.exists():
        audio_file = str(candidate)
        break
else:
    raise FileNotFoundError("🎧 Audio file not found!")

# 🧠 Whisperで文字起こし！
result = model.transcribe(
    audio_file,
    temperature=0,
    condition_on_previous_text=True,
    best_of=10,
    word_timestamps=True,
    beam_size=10,
    language="en"
)


# 📝 Extract transcribed segments
segments = result["segments"]

# Function to format timestamps for ASS subtitles
def ass_timestamp(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 100)
    return f"{h}:{m:02}:{s:02}.{ms:02}"

buffer = 0.3
# 💾 Save transcriptions with timestamps
with open("split_lyrics.txt", "w", encoding="utf-8") as f:
    for seg in segments:
        start = seg['start']
        end = seg['end'] + buffer  # 終了時間に余裕を持たせる
        f.write(f"[{start:.2f}s - {end:.2f}s] {seg['text']}\n")

# 🎤 Print formatted lyrics
print("\n🎵 Improved Lyrics:\n ---")
for seg in segments:
    print(f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}")

print("\n✅ Split lyrics with timestamps saved in split_lyrics.txt!")



🎵 Improved Lyrics:
 ---
[17.06s - 24.76s]  You got mad on your face, you big disgrace, kicking your can all over the place, singing
[24.76s - 28.42s]  We will, we will rock you
[30.56s - 34.38s]  We will, we will rock you
[36.18s - 41.94s]  But you're a young man, hot man, shouting in the street, gonna take on the world someday
[41.94s - 47.94s]  You got blood on your face, you big disgrace, waving your banner all over the place
[47.94s - 52.24s]  We will, we will rock you
[54.16s - 58.08s]  We will, we will rock you
[59.88s - 65.50s]  But you're an old man, poor man, pleading with your eyes, gonna make you some beef someday
[65.50s - 71.54s]  You got mud on your face, big disgrace, somebody better put your bag into your place
[71.54s - 75.72s]  We will, we will rock you
[75.72s - 81.82s]  We will, we will rock you
[83.48s - 87.50s]  We will, we will rock you
[88.42s - 93.32s]  We will, we will rock you
[93.32s - 95.32s]  Alright
[118.42s - 120.50s]  We will, we will rock you
[120.50s

In [5]:
import openai
import time
import json

# 🔍 Whisperで取得した歌詞データを読み込み
with open("split_lyrics.txt", "r", encoding="utf-8") as f:
    lyrics_with_timestamps = f.read()

client = openai.OpenAI(api_key="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")

max_retries = 3
delay = 10  # 秒

for attempt in range(1, max_retries + 1):
    try:
        print(f"🌀 GPTにリクエスト中...（試行 {attempt} 回目）")
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a lyrics correction assistant."},
                {"role": "user", "content": f"""The following is a transcribed song lyrics with timestamps. 
However, there are some misrecognized words and phrases. 

Your task:
1. **Correct any transcription errors** and **ensure proper grammar and structure**.
2. **Remove unnecessary filler words** like 'uh', 'yeah', 'oh' (if they are not part of the lyrics).
3. **Keep timestamps in their original format**.
4. **Return the corrected lyrics with timestamps in the same format** as provided.
5. **Skip the duplicate phrases**

Here is the transcribed lyrics:
---
{lyrics_with_timestamps}
---
Now, please return the **corrected version**."""}
            ]
        )

        gpt_response = response.choices[0].message.content.strip()

        if not gpt_response:
            raise ValueError("Empty response from GPT-4")

        # 💾 修正済み歌詞を保存
        with open("corrected_lyrics_with_timestamps.txt", "w", encoding="utf-8") as f:
            f.write(gpt_response)

        print("✅ 修正済みの歌詞を corrected_lyrics_with_timestamps.txt に保存したのだ！")
        break  # 成功したのでループ脱出！

    except Exception as e:
        print(f"⚠️ エラー発生（{type(e).__name__}）: {e}")
        if attempt == max_retries:
            raise RuntimeError("❌ GPT-4のリクエストがすべて失敗したのだ…") from e
        else:
            print(f"⏳ {delay}秒待って再試行するのだ…")
            time.sleep(delay)



🌀 GPTにリクエスト中...（試行 1 回目）


✅ 修正済みの歌詞を corrected_lyrics_with_timestamps.txt に保存したのだ！


In [6]:
import openai
import json
import time

client = openai.OpenAI(api_key="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")

max_retries = 5
delay = 10

# 🔍 修正済みの歌詞を読み込む
with open("corrected_lyrics_with_timestamps.txt", "r", encoding="utf-8") as f:
    lyrics_with_timestamps = f.read()


# === 1. GPTに1番の開始・終了を聞く ===
max_retries = 3
delay = 3  # 秒

for attempt in range(1, max_retries + 1):
    try:
        print(f"🎤 1番の時刻推定リクエスト（試行{attempt}）")
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a lyrics processing assistant."},
                {"role": "user", "content": f"""The following is a corrected song lyrics with timestamps:
---
{lyrics_with_timestamps}
---

Your task:
1. Identify where the **first verse starts and ends**.
2. The **end** timestamp should be **approximately halfway through the full lyrics**.
3. The **end** timestamp should be over 50 seconds. 
4. Provide the **exact timestamps** for the first verse.
5. Return the timestamps **ONLY** in the following JSON format, no any instructions :
{{
    "start_time": 12.50,
    "end_time": 80.30
}}
"""}
            ]
        )
        gpt_response = response.choices[0].message.content.strip()

        # 👇 空っぽならエラーとしてリトライへ
        if not gpt_response:
            raise ValueError("Empty response from GPT-4")

        first_verse_info = json.loads(gpt_response)
        print("✅ 1番タイムスタンプ取得成功なのだ！", first_verse_info)
        # 💾 1番の時間情報を保存
        with open("first_verse_timestamps.txt", "w", encoding="utf-8") as f:
            json.dump(first_verse_info, f, indent=4)
        
        print("✅ 1番の開始・終了時間を first_verse_timestamps.txt に保存したのだ！",first_verse_info)
        break  # 成功したらループを抜ける

    except Exception as e:
        print(f"⚠️ リトライ {attempt}/{max_retries} 回目でエラー: {type(e).__name__}: {e}")
        if attempt == max_retries:
            raise RuntimeError("❌ 1番のタイムスタンプ取得にすべて失敗したのだ…") from e
        time.sleep(delay)



        

# === 2. 1番だけを抽出して整形＆分割 ===
formatted_lyrics = None
for attempt in range(1, max_retries + 1):
    try:
        print(f"🎶 1番の歌詞整形リクエスト（試行{attempt}）")
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a lyrics processing assistant."},
                {"role": "user", "content": f"""
---
{lyrics_with_timestamps}
---

The first verse starts at {first_verse_info["start_time"]}s and ends at {first_verse_info["end_time"]}s.

Your next task:
1. Extract **only** the lyrics within these timestamps.
2. Format the lyrics neatly.
3. Split the lyrics into **natural phrases** with timestamps.
4. **Skip the phrases that almost the same to already existing phrases.
5. Without any slash "/" in the sentence.
Return your answer in JSON format like this without any instruction:
[
    {{"text": "Tonight I'm gonna have myself a real good time", "start": 12.50, "end": 15.80}},
    {{"text": "I feel alive", "start": 15.80, "end": 17.30}},
    ...
]
"""}
            ]
        )
        gpt_response = response.choices[0].message.content.strip()
        formatted_lyrics = json.loads(gpt_response)

        with open("formatted_lyrics.json", "w", encoding="utf-8") as f:
            json.dump(formatted_lyrics, f, indent=4)

        print("✅ 整形済み＆分割済みの歌詞を formatted_lyrics.json に保存したのだ！", formatted_lyrics)
        break
    except Exception as e:
        print(f"⚠️ エラー発生（{type(e).__name__}）: {e}")
        if attempt == max_retries:
            raise RuntimeError("❌ 歌詞整形＆分割リクエストがすべて失敗したのだ…") from e
        time.sleep(delay)


🎤 1番の時刻推定リクエスト（試行1）


✅ 1番タイムスタンプ取得成功なのだ！ {'start_time': 17.06, 'end_time': 58.38}
✅ 1番の開始・終了時間を first_verse_timestamps.txt に保存したのだ！ {'start_time': 17.06, 'end_time': 58.38}
🎶 1番の歌詞整形リクエスト（試行1）


✅ 整形済み＆分割済みの歌詞を formatted_lyrics.json に保存したのだ！ [{'text': 'You got mud on your face, you big disgrace, kicking your can all over the place, singing', 'start': 17.06, 'end': 25.06}, {'text': 'We will, we will rock you', 'start': 24.76, 'end': 28.72}, {'text': "You're a young man, hard man, shouting in the street, gonna take on the world someday", 'start': 36.18, 'end': 42.24}, {'text': 'You got blood on your face, you big disgrace, waving your banner all over the place', 'start': 41.94, 'end': 48.24}, {'text': 'We will, we will rock you', 'start': 47.94, 'end': 52.54}]


In [7]:
import json
from pydub import AudioSegment
from gtts import gTTS
import openai
import subprocess
import requests
import boto3

# ✅ OpenAI APIキー（環境変数に設定済みなら不要）
api_key = "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"
client = openai.OpenAI(api_key=api_key)

# ✅ ずんだもんの VOICEVOX 音声ID（ノーマル）
# ZUNDAMON_VOICE_ID = 3  
# VOICEVOX_URL = "http://127.0.0.1:50021"
# ✅ Your API Key
API_KEY = "u-A78-362898440"  # Replace with your actual API key
# ✅ VOICEVOX API URL
VOICEVOX_API_URL = "https://api.su-shiki.com/v2/voicevox/audio/"  # ✅ Fixed URL
# ✅ Zundamon Voice ID
ZUNDAMON_VOICE_ID = 3  # (ノーマル Zundamon)
polly = boto3.client("polly", region_name="us-east-1")  # リージョンはお好みで


# 🎵 1番のタイムスタンプを JSON から読み込む
with open("formatted_lyrics.json", "r", encoding="utf-8") as f:
    timestamps = json.load(f)  # JSONを辞書型に変換


# 🎵 元のMP3ファイルをロード
audio = AudioSegment.from_file(audio_file)

# 📂 リストを用意（英語音声・翻訳テキスト・日本語音声）
# 📂 各ファイルをリスト化（英語音声・翻訳テキスト・ずんだもん音声）
english_audio_files = []
english_audio_files_2 = []
translated_texts = []
zundamon_audio_files = []


# 🔪 音声をフレーズごとにカット＆翻訳＆日本語音声生成
for i, segment in enumerate(timestamps):
    start_ms = int(segment["start"] * 1000)  # 秒 → ミリ秒変換
    end_ms = int(segment["end"] * 1000)
    
    # print(f"Processing segment {i+1}: {segment['text']} ({start_ms}ms - {end_ms}ms)")
    
    # 🎶 フレーズをカット
    phrase_audio = audio[start_ms:end_ms]
    
    # 💾 英語の音声を保存
    eng_filename = f"phrase_{i+1}_eng.wav"
    phrase_audio.export(eng_filename, format="wav")
    english_audio_files.append(eng_filename)
    
    print(f"✅ {eng_filename} を保存しました！（{segment['text']}）")

    
    # 🌍 gTTS で教科書みたいな英語を生成
    # 💾 WAV に変換して保存
    eng_filename_2 = f"phrase_{i+1}_eng_2.wav"
    response = polly.synthesize_speech(
        Text=segment['text'],
        OutputFormat="mp3",
        VoiceId="Ruth",
        Engine="neural",
    )
    with open("temp.mp3", "wb") as f:
        f.write(response["AudioStream"].read())
    AudioSegment.from_file("temp.mp3").export(eng_filename_2, format="wav")  # WAV に変換
    english_audio_files_2.append(eng_filename_2)

    
    print(f"✅ {eng_filename_2} を保存しました！（{segment['text']}）")

    
    # 🌍 GPT-4 で和訳
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a professional English-Japanese translator. \
            Translate the given the snippets of English song lyrics into natural Japanese, \
            you should refer the full lyrics for accurate translation,\
            and make sure to add a cute ending such as 'のだ' or 'なのだ' at the end.But not'なのだよ'"},
            {"role": "user", "content": f"""The full lyrics is shown below
            ---
            {timestamps}
            ---
            Translate the following song lyrics to natural Japanese:\n\n{segment['text']}\n
"""}
            
        ]
    )
    translated_text = response.choices[0].message.content.strip()
    translated_texts.append(translated_text)

    print(f"📝 翻訳完了: {translated_text}")

    # 🎤 Generate Zundamon's voice using VOICEVOX Cloud API with speed control
    params = {
        "text": translated_text,
        "speaker": ZUNDAMON_VOICE_ID,
        "key": API_KEY,
        "speedScale": 1.3  # 🔥 Set desired speed (1.0 is normal speed, 1.3 is faster)
    }
    
    response = requests.get(VOICEVOX_API_URL, params=params)
    
    if response.status_code == 200:
        zunda_filename = f"phrase_{i+1}_zunda.wav"
        with open(zunda_filename, "wb") as f:
            f.write(response.content)
    
        zundamon_audio_files.append(zunda_filename)  # Add the generated file to the list
        print(f"🎙 {zunda_filename} saved! ({translated_text})")
    else:
        print(f"❌ Failed to generate Zundamon voice: {response.text}")




print("🎉 すべてのフレーズ処理が完了しました！！")

# 💾 すべてのデータを保存（後で合成に使う）
data = {
    "english_audio": english_audio_files,
    "english_audio_2": english_audio_files_2,
    "translated_texts": translated_texts,
    "japanese_audio": zundamon_audio_files
}

with open("processed_data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("✅ データを processed_data.json に保存！")


✅ phrase_1_eng.wav を保存しました！（You got mud on your face, you big disgrace, kicking your can all over the place, singing）
✅ phrase_1_eng_2.wav を保存しました！（You got mud on your face, you big disgrace, kicking your can all over the place, singing）


📝 翻訳完了: 顔に泥を塗られて、大恥をかいてるさ。缶蹴りしてばかりで、あちこちに散らばって、歌っているんだ。


🎙 phrase_1_zunda.wav saved! (顔に泥を塗られて、大恥をかいてるさ。缶蹴りしてばかりで、あちこちに散らばって、歌っているんだ。)
✅ phrase_2_eng.wav を保存しました！（We will, we will rock you）
✅ phrase_2_eng_2.wav を保存しました！（We will, we will rock you）


📝 翻訳完了: 私たちは、私たちは、あなたを揺さぶるのだ


🎙 phrase_2_zunda.wav saved! (私たちは、私たちは、あなたを揺さぶるのだ)
✅ phrase_3_eng.wav を保存しました！（You're a young man, hard man, shouting in the street, gonna take on the world someday）
✅ phrase_3_eng_2.wav を保存しました！（You're a young man, hard man, shouting in the street, gonna take on the world someday）


📝 翻訳完了: 君は若者で、硬い男、通りで叫んでいる、いつかこの世界を手に入れるんだ


🎙 phrase_3_zunda.wav saved! (君は若者で、硬い男、通りで叫んでいる、いつかこの世界を手に入れるんだ)
✅ phrase_4_eng.wav を保存しました！（You got blood on your face, you big disgrace, waving your banner all over the place）
✅ phrase_4_eng_2.wav を保存しました！（You got blood on your face, you big disgrace, waving your banner all over the place）


📝 翻訳完了: 君の顔には血がついていて、大きな恥さらし、君の旗をあちこちで振り回しているのだ。


🎙 phrase_4_zunda.wav saved! (君の顔には血がついていて、大きな恥さらし、君の旗をあちこちで振り回しているのだ。)
✅ phrase_5_eng.wav を保存しました！（We will, we will rock you）
✅ phrase_5_eng_2.wav を保存しました！（We will, we will rock you）


📝 翻訳完了: 私たちは、私たちは、あなたを揺さぶるのだ


🎙 phrase_5_zunda.wav saved! (私たちは、私たちは、あなたを揺さぶるのだ)
🎉 すべてのフレーズ処理が完了しました！！
✅ データを processed_data.json に保存！


In [8]:
import json
from pydub import AudioSegment

# 📂 ファイル設定
original_audio_file = "music.mp3"  # 元音源
timestamps_file = "first_verse_timestamps.txt"  # 1番の開始・終了時間が入ったJSON
output_audio_file = "first_verse.wav"  # 1番だけの音源

# 🎵 音声を読み込み
original_audio = AudioSegment.from_file(original_audio_file)

# 🕒 **JSONをパースして終了時間を取得**
with open(timestamps_file, "r", encoding="utf-8") as f:
    timestamps = json.load(f)  # JSONを辞書として読み込む
    end_time = float(timestamps["end_time"]) * 1000  # 秒 → ミリ秒変換

# 🎶 **1番だけを切り出し**（開始時間は 0 秒固定）
first_verse_audio = original_audio[:int(end_time+3000)]

# 🎚️ **フェードアウトを適用**
first_verse_audio = first_verse_audio.fade_out(3000)  # 3秒フェードアウト

# 💾 **保存**
first_verse_audio.export(output_audio_file, format="wav")
print(f"✅ {output_audio_file} を保存したのだ！")


✅ first_verse.wav を保存したのだ！


In [9]:
from pydub import AudioSegment
import json

# 📂 ファイル設定
original_audio_file = "first_verse.wav"  # 1番だけの元音源
chime_file = "../../src/sounds/Bell_Accent06-1(Dry).mp3"  # 注意書きの前のチャイム音
first_verse_file = "first_verse.wav"
processed_data_file = "processed_data.json"
formatted_lyrics_file = "formatted_lyrics.json"

output_audio_file = "final_output.mp3"  # 最終的な合成音声
output_json_file = "final_output.json"  # 動画用のJSONデータ

json_data = []
# 🔄 **データをロード**
with open(processed_data_file, "r", encoding="utf-8") as f:
    processed_data = json.load(f)

with open(formatted_lyrics_file, "r", encoding="utf-8") as f:
    lyrics_data = json.load(f)

# 🎵 **1番の音源をロード**
first_verse = AudioSegment.from_file(first_verse_file, format="mp3")

import os

if not os.path.exists(chime_file):
    raise FileNotFoundError(f"File not found: {chime_file}")

# 🔔 **チャイムと間（数秒）**
chime = AudioSegment.from_file(chime_file, format="mp3")
silence = AudioSegment.silent(duration=500)  # 1秒の無音
silence_2 = AudioSegment.silent(duration=3000)  # 1秒の無音

# 🎶 **音声結合開始！**
final_audio = chime + silence + silence + silence # 最初にチャイムと無音
# 📜 **JSONにデータを追加**
json_data.append({
    "index": 0,
    "original_text": "chime",
    "translated_text": "chime",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})

final_audio +=  silence # 最初にチャイムと無音
# 📜 **JSONにデータを追加**
json_data.append({
    "index": 1,
    "original_text": "silence",
    "translated_text": "silence",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})

final_audio += first_verse + silence  # 1番
# 📜 **JSONにデータを追加**
json_data.append({
    "index": 2,
    "original_text": "first_verse",
    "translated_text": "first_verse",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})

final_audio +=  silence + chime + silence # 1番
# 📜 **JSONにデータを追加**
json_data.append({
    "index": 3,
    "original_text": "chime",
    "translated_text": "chime",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})

# 🔄 **各フレーズごとに処理**
for i, (eng_audio_file,eng_audio_file_2, jap_text) in enumerate(zip(processed_data["english_audio"],processed_data["english_audio_2"], processed_data["translated_texts"])):
    # 🎵 **元音声**
    eng_audio = AudioSegment.from_file(eng_audio_file, format="wav")
    eng_audio = eng_audio.fade_in(100).fade_out(100)
    eng_audio_2 = AudioSegment.from_file(eng_audio_file_2, format="wav")
    
    # 🎙 **ずんだもん音声**
    # print(i)
    wav_audio_file = f"phrase_{i+1}_zunda.wav"
        
    # 変換した WAV を読み込む
    jap_audio = AudioSegment.from_wav(wav_audio_file)

    # 🔗 **元音声 → ずんだもん → 元音声**
    final_audio += eng_audio 

    # 📜 **JSONにデータを追加**
    json_data.append({
        "index": i + 4,
        "original_text": lyrics_data[i]["text"],
        "translated_text": "!!!!!! NONE !!!!!!!",
        "end_time": len(final_audio) / 1000,  # 秒に変換
        "type":"original",
    })

    final_audio +=  eng_audio_2 

    # 📜 **JSONにデータを追加**
    json_data.append({
        "index": i + 4,
        "original_text": lyrics_data[i]["text"],
        "translated_text": "!!!!!! NONE2 !!!!!!!",
        "end_time": len(final_audio) / 1000,  # 秒に変換
        "type":"English",
        
    })

    final_audio += jap_audio  + eng_audio + chime 

    # 📜 **JSONにデータを追加**
    json_data.append({
        "index": i + 4,
        "original_text": lyrics_data[i]["text"],
        "translated_text": jap_text,
        "end_time": len(final_audio) / 1000,  # 秒に変換
        "type":"Japanese",
        
    })

# 🔚 **最後にもう一度 1番の元音源（フェードアウト）**
final_audio += silence_2  
json_data.append({
    "index": i+5,
    "original_text": "chime_2",
    "translated_text": "chime_verse_2",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})
final_audio +=  first_verse.fade_out(3000)
json_data.append({
    "index": i+6,
    "original_text": "first_verse_2",
    "translated_text": "first_verse_2",
    "end_time": len(final_audio) / 1000,  # 秒に変換
    "type":"bridge",
    
})


# 💾 **音声ファイルを保存**
final_audio.export("final_output.mp3", format="mp3")
print("✅ final_output.mp3 を作成したのだ！！🔥")

# 📜 **JSONデータを保存**
with open("final_output.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)
print("✅ final_output.json も作成したのだ！🎉")

✅ final_output.mp3 を作成したのだ！！🔥
✅ final_output.json も作成したのだ！🎉


In [7]:
import json
import openai
import requests
import re

# Set your OpenAI API key
openai.api_key = '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'


# Read and clean lyrics from .txt file
with open('corrected_lyrics_with_timestamps.txt', 'r', encoding='utf-8') as file:
    lyrics_lines = [re.sub(r"\[.*?\]\s*", "", line).strip() for line in file]
lyrics_data = " ".join(lyrics_lines)

# Descriptive prompt for YouTube aspect ratio (16:9)
image_prompt = (
    f"Create an abstract, vibrant, artistic landscape-oriented (16:9 aspect ratio) image inspired by these song lyrics: {lyrics_data}. "
    "The visuals should be vivid, energetic, expressive, joyful, uplifting, and ideal for a music visualization background or YouTube video."
)

# Generate image with DALL-E at landscape aspect ratio
response = openai.images.generate(
    model="dall-e-3",
    prompt=image_prompt,
    size="1792x1024",  # YouTube-friendly 16:9 size
    quality="hd",
    n=1,
)

# Get image URL
image_url = response.data[0].url
print(f"🎨 Image URL: {image_url}")

# Download and save the image
img_response = requests.get(image_url)
with open('song_background_16_9.jpg', 'wb') as image_file:
    image_file.write(img_response.content)

print("✅ Image saved as 'song_background_16_9.jpg'")

import cv2

# Load the generated image
img = cv2.imread('song_background_16_9.jpg')

# Apply a gentle Gaussian blur (increase kernel size for stronger blur)
# blurred_img = cv2.GaussianBlur(img, (25, 25), 0)

# # Darken the image significantly (adjust the alpha lower for darker)
# darker_img = cv2.convertScaleAbs(blurred_img, alpha=0.4, beta=-30)

# Resize to exactly854x480 (YouTube 16:9)
resized_img = cv2.resize(img, (854, 480), interpolation=cv2.INTER_LANCZOS4)

cv2.imwrite('song_background_diluted.jpg', resized_img)

print("✅ Diluted image saved as 'song_background_diluted.jpg'")



🎨 Image URL: https://oaidalleapiprodscus.blob.core.windows.net/private/org-mrMPnnyPnpeaeyIDHJktEGA4/user-iKLyH1mSYw9lswsnJFYrPPNQ/img-TQ9SlwSAMSVAqyygQJR6J5BK.png?st=2025-04-14T20%3A02%3A20Z&se=2025-04-14T22%3A02%3A20Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=d505667d-d6c1-4a0a-bac7-5c84a87759f8&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-04-14T09%3A35%3A55Z&ske=2025-04-15T09%3A35%3A55Z&sks=b&skv=2024-08-04&sig=bgrqHmoigAZigpw2QEA3K5OoFJKC1nXcsW5Zwnkuslk%3D
✅ Image saved as 'song_background_16_9.jpg'
✅ Diluted image saved as 'song_background_diluted.jpg'


✅ Image saved as 'song_background_16_9.png'
✅ Diluted image saved as 'song_background_diluted.png'


In [8]:
import json
import openai
import requests
import re

# Set your OpenAI API key
openai.api_key = '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'


# Read and clean lyrics from .txt file
with open('corrected_lyrics_with_timestamps.txt', 'r', encoding='utf-8') as file:
    lyrics_lines = [re.sub(r"\[.*?\]\s*", "", line).strip() for line in file]
lyrics_data = " ".join(lyrics_lines)

# Descriptive prompt for YouTube aspect ratio (16:9)
image_prompt = (
    f"Create an abstract, vibrant, artistic landscape-oriented (16:9 aspect ratio) image inspired by these song lyrics: {lyrics_data}. "
    "The visuals should be vivid, energetic, expressive, joyful, uplifting, and ideal for a music visualization background or YouTube video."
)

# Generate image with DALL-E at landscape aspect ratio
response = openai.images.generate(
    model="dall-e-3",
    prompt=image_prompt,
    size="1792x1024",  # YouTube-friendly 16:9 size
    quality="hd",
    n=1,
)

# Get image URL
image_url = response.data[0].url
print(f"🎨 Image URL: {image_url}")

# Download and save the image
img_response = requests.get(image_url)
with open('song_background_16_9_2.jpg', 'wb') as image_file:
    image_file.write(img_response.content)

print("✅ Image saved as 'song_background_16_9_2.jpg'")

import cv2

# Load the generated image
img = cv2.imread('song_background_16_9_2.jpg')

# Apply a gentle Gaussian blur (increase kernel size for stronger blur)
# blurred_img = cv2.GaussianBlur(img, (25, 25), 0)

# # Darken the image significantly (adjust the alpha lower for darker)
# darker_img = cv2.convertScaleAbs(blurred_img, alpha=0.4, beta=-30)

# Resize to exactly 854x480 (YouTube 16:9)
resized_img = cv2.resize(img, (854, 480), interpolation=cv2.INTER_LANCZOS4)

cv2.imwrite('song_background_diluted_2.jpg', resized_img)

print("✅ Diluted image saved as 'song_background_diluted.jpg'")



🎨 Image URL: https://oaidalleapiprodscus.blob.core.windows.net/private/org-mrMPnnyPnpeaeyIDHJktEGA4/user-iKLyH1mSYw9lswsnJFYrPPNQ/img-VXBwUKlv1z9cvRsHR3jQ6HSx.png?st=2025-04-14T20%3A02%3A44Z&se=2025-04-14T22%3A02%3A44Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=d505667d-d6c1-4a0a-bac7-5c84a87759f8&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-04-14T13%3A43%3A14Z&ske=2025-04-15T13%3A43%3A14Z&sks=b&skv=2024-08-04&sig=yv4QpXIV1OAsdiPY%2BYcb3qj4PtsYYyuvkzbapYgDCOM%3D
✅ Image saved as 'song_background_16_9_2.jpg'
✅ Diluted image saved as 'song_background_diluted.jpg'


✅ Image saved as 'song_background_16_9_2.png'
✅ Diluted image saved as 'song_background_diluted.png'


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from scipy.interpolate import make_interp_spline
from matplotlib.colors import PowerNorm
from matplotlib.collections import LineCollection
import subprocess


os.makedirs("frames", exist_ok=True)

# Parameters
total_frames = 300
fps = 60


# Load your processed darkened background image
background_img = plt.imread('song_background_diluted.jpg')

num_histograms = 10
bins = 100
data_points = 5000
total_frames = 120
np.random.seed(42)

# Generate random histograms with first and last being identical
random_histograms = np.random.rand(num_histograms - 1, bins) * 5
random_histograms = np.vstack([random_histograms, random_histograms[0]])

histogram_times = np.linspace(0, total_frames, num_histograms)

def interpolate_histograms(random_histograms, frame, total_frames):
    interp_func = make_interp_spline(histogram_times, random_histograms, axis=0, k=3)
    return interp_func(frame)

# Prepare x-axis bins and smooth interpolation
x_bins = np.linspace(0, 40 * np.pi, bins)
x_smooth = np.linspace(x_bins.min(), x_bins.max(), data_points)

# Animation loop
for i in range(total_frames):
    fig, ax = plt.subplots(figsize=(16, 9), dpi=100)

    # Background image
    background_height = 5
    ax.imshow(background_img, extent=[x_smooth.min(), x_smooth.max(), 0, background_height], aspect='auto')

    # Interpolate histogram for current frame
    interpolated_hist = interpolate_histograms(random_histograms, i, total_frames)

    # Smooth interpolation along x-axis
    spline = make_interp_spline(x_bins, interpolated_hist, k=3)
    animated_y = spline(x_smooth)

    # Normalize waveform to exactly 70% of background height
    waveform_max_height = background_height * 0.7
    animated_y = animated_y / np.max(animated_y) * waveform_max_height

    # Apply steeper color gradient
    norm = PowerNorm(gamma=0.5, vmin=animated_y.min(), vmax=animated_y.max())
    points = np.array([x_smooth, animated_y]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)

    lc = LineCollection(segments, cmap='viridis', norm=norm)
    lc.set_array(animated_y)
    lc.set_linewidth(2)
    ax.add_collection(lc)

    ax.fill_between(x_smooth, animated_y, 0, color='black', alpha=0.3)

    ax.set_xlim(x_smooth.min(), x_smooth.max())
    ax.set_ylim(0, background_height)
    ax.axis('off')

    frame_filename = f"frames/frame_{i:04d}.jpg"
    plt.savefig(frame_filename, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

print("✅ Seamless looping waveform animation frames created!")

# Generate video from frames
def frames_to_video(frames_folder, output_filename, fps=60):
    subprocess.run([
        "ffmpeg",
        "-y",
        "-framerate", str(fps),
        "-i", f"{frames_folder}/frame_%04d.jpg",
        "-vf", "scale=854:480:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        output_filename
    ], check=True)

# Loop video
def loop_video(input_video, output_video, num_loops):
    subprocess.run([
        "ffmpeg",
        "-y",
        "-stream_loop", str(num_loops - 1),
        "-i", input_video,
        "-c", "copy",
        output_video
    ], check=True)

# Example Usage:
if __name__ == "__main__":
    frames_folder = "frames"
    short_video = "histogram_loop.mp4"
    final_video = "looped_histogram_movie.mp4"
    num_loops = 5

    frames_to_video(frames_folder, short_video, fps=fps)
    loop_video(short_video, final_video, num_loops)

    print(f"🎬 Final chaotic histogram movie saved as '{final_video}'!")

✅ Seamless looping waveform animation frames created!


ffmpeg version 9c33b2f Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/home/ubuntu/miniconda3/envs/whisper_env --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-libx264 --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/pkg-config
  libavutil      56. 51.100 / 56. 51.100
  libavcodec     58. 91.100 / 58. 91.100
  libavformat    58. 45.100 / 58. 45.100
  libavdevice    58. 10.100 / 58. 10.100
  libavfilter     7. 85.100 /  7. 85.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  7.100 /  5.  7.100
  libswresample   3.

🎬 Final chaotic histogram movie saved as 'looped_histogram_movie.mp4'!


frame=  120 fps=0.0 q=-1.0 Lsize=     733kB time=00:00:01.95 bitrate=3077.3kbits/s speed=2.01x    
video:730kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.307691%
[libx264 @ 0x595dc3d91480] frame I:1     Avg QP:26.86  size: 89852
[libx264 @ 0x595dc3d91480] frame P:31    Avg QP:26.78  size: 14334
[libx264 @ 0x595dc3d91480] frame B:88    Avg QP:32.52  size:  2420
[libx264 @ 0x595dc3d91480] consecutive B-frames:  1.7%  1.7%  0.0% 96.7%
[libx264 @ 0x595dc3d91480] mb I  I16..4:  1.5% 42.8% 55.7%
[libx264 @ 0x595dc3d91480] mb P  I16..4:  0.1%  1.6%  1.6%  P16..4: 16.2% 14.6% 10.0%  0.0%  0.0%    skip:56.0%
[libx264 @ 0x595dc3d91480] mb B  I16..4:  0.0%  0.0%  0.0%  B16..8: 12.9%  6.6%  2.6%  direct: 3.1%  skip:74.7%  L0:40.1% L1:49.7% BI:10.3%
[libx264 @ 0x595dc3d91480] 8x8 transform intra:46.3% inter:48.6%
[libx264 @ 0x595dc3d91480] coded y,uvDC,uvAC intra: 93.0% 99.8% 98.5% inter: 8.2% 14.8% 12.1%
[libx264 @ 0x595dc3d91480] i16 v,h,dc,p: 36% 14%  7% 43%
[

frame=  120 fps=0.0 q=-1.0 Lsize=     557kB time=00:00:01.95 bitrate=2338.8kbits/s speed=2.08x    
video:555kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.403826%
[libx264 @ 0x5f36ecb30f80] frame I:1     Avg QP:25.76  size: 47512
[libx264 @ 0x5f36ecb30f80] frame P:30    Avg QP:26.50  size: 10919
[libx264 @ 0x5f36ecb30f80] frame B:89    Avg QP:32.05  size:  2158
[libx264 @ 0x5f36ecb30f80] consecutive B-frames:  0.8%  0.0%  2.5% 96.7%
[libx264 @ 0x5f36ecb30f80] mb I  I16..4:  2.3% 61.1% 36.6%
[libx264 @ 0x5f36ecb30f80] mb P  I16..4:  0.1%  4.0%  2.7%  P16..4: 14.5% 14.9%  8.4%  0.0%  0.0%    skip:55.3%
[libx264 @ 0x5f36ecb30f80] mb B  I16..4:  0.0%  0.0%  0.1%  B16..8: 15.7%  7.9%  2.1%  direct: 2.0%  skip:72.3%  L0:41.6% L1:49.0% BI: 9.4%
[libx264 @ 0x5f36ecb30f80] 8x8 transform intra:58.7% inter:61.4%
[libx264 @ 0x5f36ecb30f80] coded y,uvDC,uvAC intra: 90.1% 99.4% 90.3% inter: 7.2% 12.8% 8.8%
[libx264 @ 0x5f36ecb30f80] i16 v,h,dc,p: 24% 10%  2% 64%
[l

In [12]:
import openai
import requests
import json
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont

# ✅ Set API Keys
OPENAI_API_KEY = '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'
PEXELS_API_KEY = "u8kqiLXhR9cXf5h8kWMDDmjEbfQ4AevXSvD60H6lPH47jdMRMluxPcCF"

# ✅ Initialize APIs
openai.api_key = OPENAI_API_KEY
PEXELS_API_URL = "https://api.pexels.com/v1/search"

# ✅ Step 1: Extract Keywords Using GPT
def extract_keywords_gpt(text_file, num_keywords=4):
    """Uses OpenAI GPT to extract relevant keywords (animals, tools, food, objects)."""
    
    with open(text_file, "r", encoding="utf-8") as f:
        text = f.read()

    prompt = f"""
    Extract {num_keywords} important keywords from the following text. 
    
    Text: {text}
    
    Return the result as a comma-separated list.
    """

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    keywords = response.choices[0].message.content.strip().split(", ")
    
    return keywords

# ✅ Step 2: Fetch Images from Pexels API
def fetch_image(keyword):
    """Fetch an image URL from Pexels based on the given keyword."""
    headers = {"Authorization": PEXELS_API_KEY}
    params = {"query": keyword, "per_page": 1}

    response = requests.get(PEXELS_API_URL, headers=headers, params=params)

    if response.status_code == 200 and response.json()["photos"]:
        return response.json()["photos"][0]["src"]["large"]
    else:
        print(f"❌ No image found for: {keyword}")
        return None

# ✅ Step 3: Generate Images with Text Overlay
def create_image_with_text(image_url, text, output_filename):
    """Creates an image with a text overlay using a downloaded image or a blank background."""
    if image_url:
        response = requests.get(image_url)
        bg = Image.open(BytesIO(response.content)).convert("RGBA")
    else:
        bg = Image.new("RGBA", (854, 480), (255, 255, 255, 255))  # White background
    
    bg = bg.resize((854, 480))
    draw = ImageDraw.Draw(bg)

    font_path = "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf"  # Ensure this font exists
    font_large = ImageFont.truetype(font_path, 60)

    # # Centered text
    # draw.text((640, 360), text, font=font_large, fill="black", anchor="mm")
    bg = bg.convert("RGB") 
    bg.save(output_filename)
    print(f"✅ Image saved as {output_filename}")

# ✅ Step 4: Run the Full Pipeline
keywords = extract_keywords_gpt("corrected_lyrics_with_timestamps.txt")
print(f"🎯 GPT-Extracted Keywords: {keywords}")

for i, keyword in enumerate(keywords):
    image_url = fetch_image(keyword)
    create_image_with_text(image_url, f"Keyword: {keyword}", f"output_{i+1}.jpg")

print("🎉 All images generated!")


🎯 GPT-Extracted Keywords: ['mud on your face', 'big disgrace', 'we will rock you', 'take on the world someday']
✅ Image saved as output_1.jpg
✅ Image saved as output_2.jpg
✅ Image saved as output_3.jpg
✅ Image saved as output_4.jpg
🎉 All images generated!


✅ Image saved as output_1.png


✅ Image saved as output_2.png


✅ Image saved as output_3.png


✅ Image saved as output_4.png
🎉 All images generated!


In [39]:
import random
from PIL import Image, ImageDraw, ImageFont

# ✅ Generate a Random Highlight Color
highlight_colors = [
    (255, 0, 0),  # Red
    (0, 158, 0),  # Green
    (0, 0, 255),  # Blue
    (255, 165, 0),  # Orange
    (128, 0, 128)  # Purple
]
selected_color = random.choice(highlight_colors)

# ✅ Function to Add a Properly Centered Text Box
def add_styled_textbox(image_path, text_segments, output_path):
    """Adds a semi-transparent text box with centered, bold, and colored text at the center of an image."""
    
    # Open Image
    img = Image.open(image_path).convert("RGBA")
    draw = ImageDraw.Draw(img)

    # ✅ Define Box Colors
    base_color = selected_color
    fill_color = (255, 255, 255, 180)  # More White + Transparency
    border_color = base_color + (255,)  # Solid Border

    # ✅ Load Font
    font_path = "../../src/fonts/nicomoji-plus_v2-5.ttf"  # Ensure the font file exists
    font_size = 20
    font_size_0 = 40
    font = ImageFont.truetype(font_path, font_size)
    font_0 = ImageFont.truetype(font_path, font_size_0)

    # ✅ Calculate Text Size for Box
    text_lines = [segment["text"] for segment in text_segments]
    text_widths = [draw.textbbox((0, 0), line, font=font)[2] for line in text_lines]
    text_heights = [draw.textbbox((0, 0), line, font=font)[3] for line in text_lines]
    
    max_text_width = max(text_widths)  # Longest line width
    total_text_height = sum(text_heights) + len(text_lines) * 10  # Space between lines

    # ✅ Define Box Size & Position (Centered)
    box_padding = 50
    box_padding_h = 15
    box_width = max_text_width + box_padding * 2
    box_height = total_text_height + box_padding_h * 2
    box_x = (img.width - box_width) / 2
    box_y = (img.height - box_height) / 2

    # ✅ Create Transparent Overlay
    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)

    # ✅ Draw Semi-Transparent Box
    overlay_draw.rectangle(
        [box_x, box_y, box_x + box_width, box_y + box_height],
        fill=fill_color, outline=border_color, width=6
    )

    # ✅ Merge Overlay with Image
    img = Image.alpha_composite(img, overlay)

    # ✅ Draw Centered Text Inside the Box
    draw = ImageDraw.Draw(img)
    text_x = box_x + box_padding
    text_y = box_y + box_padding_h + 70
    text_x_0 = box_width / 2 + 40
    text_y_0 = box_y +20

    draw.text((text_x_0, text_y_0), text_segments[0]["text"], font=font_0, fill=text_segments[1]["color"])        
    text_y += font_size

    #この動画はうp主の英語勉強用なのだ
    draw.text((text_x, text_y), text_segments[1]["text"], font=font, fill=text_segments[1]["color"])        
    text_y += font_size + 10
    #誰もが知る音楽を聞きながら、
    draw.text((text_x, text_y), text_segments[2]["text"], font=font, fill=text_segments[2]["color"])        
    text_y += font_size + 10
    #英語を学べるように工夫をしてみたのだ
    draw.text((text_x, text_y), text_segments[3]["text"], font=font, fill=text_segments[3]["color"])        
    text_y += font_size + 30
    #
    #試験的にPythonで全自動で生成
    draw.text((text_x, text_y), text_segments[4]["text"], font=font, fill=text_segments[4]["color"])        
    text_widths = draw.textbbox((0, 0), text_segments[4]["text"], font=font)[2]
    text_x += text_widths
    #されているのだ。
    draw.text((text_x, text_y), text_segments[5]["text"], font=font, fill=text_segments[5]["color"])        
    text_x = box_x + box_padding
    text_y += font_size + 10
    #不自然な点や間違っている点があったら教えてほしいのだ
    draw.text((text_x, text_y), text_segments[6]["text"], font=font, fill=text_segments[6]["color"])        
    text_y += font_size + 30

    #音源は
    draw.text((text_x, text_y), text_segments[7]["text"], font=font, fill=text_segments[7]["color"])        
    text_widths = draw.textbbox((0, 0), text_segments[7]["text"], font=font)[2]
    text_x += text_widths
    #著作権フリー
    draw.text((text_x, text_y), text_segments[8]["text"], font=font, fill=text_segments[8]["color"])        
    text_widths = draw.textbbox((0, 0), text_segments[8]["text"], font=font)[2]
    text_x +=  text_widths
    #のものを用いているのだ。
    draw.text((text_x, text_y), text_segments[9]["text"], font=font, fill=text_segments[9]["color"])        
    text_x = box_x + box_padding
    text_y += font_size + 10
    #気になる方はブラウザバックが推奨されるのだ。
    draw.text((text_x, text_y), text_segments[10]["text"], font=font, fill=text_segments[10]["color"])        
    text_y += font_size + 30


    #以上を踏まえて、
    draw.text((text_x, text_y), text_segments[11]["text"], font=font, fill=text_segments[11]["color"])        
    text_widths = draw.textbbox((0, 0), text_segments[12]["text"], font=font)[2]
    text_x += text_widths
    #一緒に楽しみながら
    draw.text((text_x, text_y), text_segments[12]["text"], font=font, fill=text_segments[12]["color"])        
    text_widths = draw.textbbox((0, 0), text_segments[12]["text"], font=font)[2]
    text_x += text_widths
    #勉強しようなのだ！
    draw.text((text_x, text_y), text_segments[13]["text"], font=font, fill=text_segments[13]["color"])        

    # ✅ Save the Modified Image
    img.convert("RGB").save(output_path)
    print(f"✅ Styled text box added to {output_path}")

    
# ✅ Example Usage
image_path = "output_1.jpg"  # Replace with your actual image
output_path = "output_text_1.jpg"

# ✅ Define Styled Text Segments
text_segments = [
    {"text": "~おことわり~ ", "color": "black"},
    {"text": "この動画はうp主の英語勉強用なのだ。 ", "color": "black"},
    {"text": "誰もが知る洋楽を聞きながら、", "color": selected_color},
    {"text": "英語を学べるように工夫をしてみたのだ。", "color": "black"},

    {"text": "試験的に機械学習を使い全自動で動画を生成", "color": selected_color},
    {"text": "しているので、", "color": "black"},
    {"text": "不自然な点や間違っている点があれば教えてほしいのだ。", "color": "black"},
    
    {"text": "音源は、", "color": "black"},
    {"text": "著作権フリー", "color": selected_color},
    {"text": "のものを用いているのだ。", "color": "black"},
    {"text": "気になる方はブラウザバックが推奨されるのだ。", "color": "black"},
    
    {"text": "以上を踏まえて、", "color": "black"},
    {"text": "一緒に楽しみながら", "color": selected_color},
    {"text": "勉強しようなのだ！", "color": "black"},
]


# ✅ Run the Function
add_styled_textbox(image_path, text_segments, output_path)


# ✅ Function to Add a Properly Centered & Left-Aligned Text Box
def add_styled_textbox(image_path, text_segments, output_path):
    """Adds a semi-transparent text box with left-aligned text except for the title, which is centered."""

    # Open Image
    img = Image.open(image_path).convert("RGBA")
    draw = ImageDraw.Draw(img)

    # ✅ Define Box Colors
    fill_color = (255, 255, 255, 200)  # More White + Semi-Transparency
    border_color = selected_color + (255,)  # Solid Border

    # ✅ Load Font
    font_path = "../../src/fonts/nicomoji-plus_v2-5.ttf"  # Ensure the font file exists
    font_size = 70
    font = ImageFont.truetype(font_path, font_size)
    

    # ✅ Calculate Text Size for Box
    text_lines = [segment["text"] for segment in text_segments]
    text_widths = [draw.textbbox((0, 0), line, font=font)[2] for line in text_lines]
    text_heights = [draw.textbbox((0, 0), line, font=font)[3] for line in text_lines]

    max_text_width = max(text_widths)  # Longest line width
    total_text_height = sum(text_heights) + len(text_lines) * 5  # **Reduced Line Spacing**

    # ✅ Define Box Size & Position (Centered Box)
    box_padding = 25
    box_width = max_text_width + box_padding * 2
    box_height = total_text_height + box_padding * 2
    box_x = (img.width - box_width) / 2
    box_y = (img.height - box_height) / 2

    # ✅ Create Transparent Overlay
    overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)

    # ✅ Draw Semi-Transparent Box
    overlay_draw.rectangle(
        [box_x, box_y, box_x + box_width, box_y + box_height],
        fill=fill_color, outline=border_color, width=6
    )

    # ✅ Merge Overlay with Image
    img = Image.alpha_composite(img, overlay)

    # ✅ Draw Text
    draw = ImageDraw.Draw(img)
    text_x = box_x + box_padding  # Left-align except for the first line
    text_y = box_y + box_padding

    for i, segment in enumerate(text_segments):
        if i == 0:  # Center-align the title
            text_w = draw.textbbox((0, 0), segment["text"], font=font)[2]
            centered_x = box_x + (box_width - text_w) / 2
            draw.text((centered_x, text_y), segment["text"], font=font, fill=segment["color"])
        else:  # Left-align the rest
            draw.text((text_x, text_y), segment["text"], font=font, fill=segment["color"])

        text_y += font_size + 5  # Reduced line spacing

    # ✅ Save the Modified Image
    img.convert("RGB").save(output_path)
    print(f"✅ Styled text box added to {output_path}")

# ✅ Apply Text Boxes to Images
image_paths = ["output_2.jpg", "output_3.jpg", "output_4.jpg"]  # Replace with actual file names
texts = [{"text": "フレーズごとの翻訳なのだ！", "color": "black"} ,{"text": "改めて曲を聞いてみるのだ！", "color": "black"}  ,{"text": "ご視聴ありがとうなのだ！", "color": "black"}  ]  # Replace with actual extracted words
output_paths = [ "output_text_2.jpg", "output_text_3.jpg", "output_text_4.jpg"]

for img, txt, out in zip(image_paths, texts, output_paths):
    add_styled_textbox(img, [txt], out)  # Pass text as a list


print("🎉 All images processed successfully!")


✅ Styled text box added to output_text_1.jpg
✅ Styled text box added to output_text_2.jpg
✅ Styled text box added to output_text_3.jpg
✅ Styled text box added to output_text_4.jpg
🎉 All images processed successfully!


✅ Styled text box added to output_text_3.png
✅ Styled text box added to output_text_4.png
🎉 All images processed successfully!


In [30]:
import os
import random
import numpy as np
from PIL import Image

# ✅ Function to randomly select an image from a given folder
def get_random_image(folder_path):
    images = [f for f in os.listdir(folder_path) if f.endswith(".png")]
    if not images:
        raise FileNotFoundError(f"No PNG images found in {folder_path}")
    return os.path.join(folder_path, random.choice(images))

folders = [["../../src/zunda/No_mike/", "../../src/methane/No_mike/"],
                   ["../../src/zunda/No_mike/", "../../src/methane/No_mike/"],
                   ["../../src/zunda/Close_eyes/", "../../src/methane/Close_eyes/"],
                   ["../../src/zunda/mike/", "../../src/methane/mike/"],
                   ["../../src/zunda/No_mike/", "../../src/methane/No_mike/"],
                   ["../../src/zunda/Close_eyes/", "../../src/methane/Close_eyes/"],
                   ["../../src/zunda/No_mike/", "../../src/methane/No_mike/"]
                  ]

for i in np.arange(0,len(folders),1):
    # ✅ Define paths
    zundamon_folder = folders[i][0]
    methane_folder = folders[i][1]
    # ✅ Get random images
    zundamon_path = get_random_image(zundamon_folder)
    methane_path = get_random_image(methane_folder)
    
    # ✅ Load Images
    zundamon = Image.open(zundamon_path).convert("RGBA")
    methane = Image.open(methane_path).convert("RGBA")
    
    # ✅ Define Canvas Size (854x480 or match a base image)
    canvas_size = (854, 480)
    canvas = Image.new("RGBA", canvas_size, (255, 255, 255, 0))  # Transparent background
    
    # ✅ Resize Zundamon & Methane-chan (Optional)
    ratio = 0.15
    width = int(round(1082*ratio,0))
    height = int(round(1650*ratio,0))
    zundamon = zundamon.resize((width, height))  # Adjust if needed
    methane = methane.resize((width, height))
    
    # ✅ Define Positions
    zundamon_pos = (canvas_size[0] - zundamon.width - 20, canvas_size[1] - zundamon.height + 80)  # Bottom-right
    methane_pos = (20, canvas_size[1] - methane.height + 80)  # Bottom-left
    
    # ✅ Paste Images onto Canvas
    canvas.paste(zundamon, zundamon_pos, zundamon)  # Keep transparency
    canvas.paste(methane, methane_pos, methane)
    
    # ✅ Save the Final Image
    output_path = "zunda_methane_output_{}.jpg".format(i)
    canvas.save(output_path, format="PNG")
    
    print(f"✅ PNG with Zundamon & Methane-chan created: {output_path}")


✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_0.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_1.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_2.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_3.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_4.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_5.jpg
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_6.jpg


✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_2.png
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_3.png


✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_4.png
✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_5.png


✅ PNG with Zundamon & Methane-chan created: zunda_methane_output_6.png


In [16]:
import json
import subprocess
import numpy as np
import re
import textwrap
import ast  # To safely convert string representation of a list into a real list


# Load JSON file
with open("final_output.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)
with open("formatted_lyrics.json", "r", encoding="utf-8") as f:
    first_json_data = json.load(f)
client = openai.OpenAI(api_key="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")



# Start ASS subtitle file
# Start ASS subtitle file
ass_lines = [
    "[Script Info]",
    "Title: Lyrics",
    "ScriptType: v4.00+",
    "",
    "[V4+ Styles]",
    "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, "
    "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
    "Alignment, MarginL, MarginR, MarginV, Encoding",
    "Style: English_1,Valty DEMO,20,&H00FFFFFF,&H00000000,&H00000000,&H00000000,1,0,0,0,100,100,0,0,1,3,1,5,10,10,280,1",
    "Style: English_2,Valty DEMO,20,&H00FFFFFF,&H00000000,&H00FF66CC,&H00000000,1,0,0,0,100,100,0,0,1,3,1,5,10,10,280,1",
    "Style: Japanese,NicoMoji+v2,16,&H00FFFFFF,&H00000000,&H0066FF66,&H00000000,1,0,0,0,100,100,0,0,1,3,1,5,10,10,200,1",
    "",
    "[Events]",
    "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text"
]
# Function to convert seconds to ASS timestamp
def ass_timestamp(seconds):
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 100)  # FFmpeg uses centiseconds
    return f"{h:01}:{m:02}:{s:02}.{ms:02}"


def split_text_Eng(text, max_length_Eng=30):
    # 🇺🇸 English: Break at natural word boundaries (spaces)
    words = text.split()
    lines = []
    line = ""

    for word in words:
        if len(line) + len(word) + 1 <= max_length_Eng:
            line += " " + word if line else word
        else:
            lines.append(line)
            line = word
        
    if line:
        lines.append(line)

    return "\\N".join(lines)  # Use `\N` for ASS format


# def split_text_Jap(text,max_length_Jap=15):
#     response = client.chat.completions.create(
#         model="gpt-4",
#         messages=[
#             {"role": "system", "content": "You are a lyrics processing assistant."},
#             {"role": "user", "content": f"""The following list contains a corrected song lyrics in each row:
#     ---
#     {text}
#     ---
#     Your task:
#     - Connect sentences from each raws of the input list and make a plane text, and sentences from each raws are connected by "-" .
#     - Insert "+" between words in each sentences when each sentences has over about {max_length_Jap} characters to make smooth line break.
#     - Remove the quotation mark 
#     - Return the processed lyrics as **a text**.
#     """}
#         ]
#     )

#     # 📜 Extract GPT-4's response
#     gpt_response = response.choices[0].message.content.strip()
    
#     # Remove English letters, digits, and common symbols
#     clean_text = re.sub(r'[A-Za-z0-9\s!"#$%&\'()*,./:;<=>?@\[\\\]^_`{|}~]+', '',gpt_response)
#     # ✅ Convert into a clean list
#     lyrics_sections = clean_text.split("-")  # Separate by "-"
 
#     print("GPT-4 Raw Response:", lyrics_sections)  # Debugging Output
#     return lyrics_sections

def break_japanese_text(text, max_chars=18):
    return '\\N'.join([text[i:i+max_chars] for i in range(0, len(text), max_chars)])



# 📝 Process lyrics
subtitle_text_prep_Eng = []
subtitle_text_prep_Jap = []
subtitle_text_Eng = []
subtitle_text_Jap = []

for entry in json_data:
    if entry["type"] == "Japanese":
        subtitle_text_prep_Eng.append(entry["original_text"])
        subtitle_text_prep_Jap.append(entry["translated_text"])
# 🎶 Split text

# subtitle_text_Jap=split_text_Jap(subtitle_text_prep_Jap)

for i in np.arange(0,len(subtitle_text_prep_Eng),1):
    subtitle_text_Eng.append(split_text_Eng(subtitle_text_prep_Eng[i]))
    subtitle_text_Jap.append(break_japanese_text(subtitle_text_prep_Jap[i]))

# 🛠 Debugging Output
print("Processed Japanese Subtitles:", subtitle_text_Jap)
print("Processed English Subtitles:", subtitle_text_Eng)



silence_1_duration = json_data[1]["end_time"]  # Duration of Notion image before waveform starts
silence_3_duration = json_data[-2]["end_time"]  # Total duration of the final video
fade_duration = 500 

j=0
previous_time = 0  # Track previous subtitle's time
for entry in first_json_data:
    start_time = silence_1_duration + entry["start"]
    end_time = silence_1_duration + entry["end"]
    subtitle_text = subtitle_text_Eng[j]
    style = "English_1"
    ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time+0.01)},{ass_timestamp(end_time)},{style},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text}"
    )
    j += 1 
    
    # print("english:",entry["text"])
    
previous_time = 0  # Track previous subtitle's time
j = 0
# Convert each lyric entry to ASS format
for entry in json_data:
    if entry["type"] == "bridge":
        # print("skip:",entry["translated_text"])
        previous_entry = entry["end_time"]    
        continue  # Skip empty lines and chimes

    if entry["type"] == "original":
        start_time = previous_entry
        end_time = entry["end_time"]
        subtitle_text = subtitle_text_Eng[j]
        style = "English_1"
        # print("english:",entry["original_text"])
        ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time)},{ass_timestamp(end_time)},{style},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text}"
        )


    if entry["type"] == "English":
        start_time = previous_entry
        end_time = entry["end_time"]
        subtitle_text = subtitle_text_Eng[j]
        style = "English_2"
        # print("english:",entry["original_text"])
        ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time)},{ass_timestamp(end_time)},{style},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text}"
        )

    
    if entry["type"] == "Japanese":
        start_time = previous_entry 
        end_time = entry["end_time"]
        subtitle_text_1 =  subtitle_text_Eng[j]
        subtitle_text_2 = subtitle_text_Jap[j].replace("+", r"\N")
        style_1 = "English_2"
        style_2 = "Japanese"
        # print("Japanese:",entry["translated_text"])
        ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time)},{ass_timestamp(end_time)},{style_1},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text_1}"
        )
        ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time)},{ass_timestamp(end_time)},{style_2},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text_2}"
        )
        j += 1 

    previous_entry = entry["end_time"]


j=0
previous_time = 0  # Track previous subtitle's time
for entry in first_json_data:
    start_time = silence_3_duration + entry["start"]
    end_time = silence_3_duration + entry["end"]
    subtitle_text = subtitle_text_Eng[j]
    style = "English_1"
    ass_lines.append(
        f"Dialogue: 0,{ass_timestamp(start_time+0.01)},{ass_timestamp(end_time)},{style},,0,0,0,,{{\fad({fade_duration},{fade_duration})}}{subtitle_text}"
    )
    j += 1 


# Save as an ASS file
with open("lyrics.ass", "w", encoding="utf-8") as f:
    f.write("\n".join(ass_lines))

print("✅ Successfully created subtitles: lyrics.ass")

Processed Japanese Subtitles: ['顔に泥を塗られて、大恥をかいてるさ。\\N缶蹴りしてばかりで、あちこちに散らば\\Nって、歌っているんだ。', '私たちは、私たちは、あなたを揺さぶる\\Nのだ', '君は若者で、硬い男、通りで叫んでいる\\N、いつかこの世界を手に入れるんだ', '君の顔には血がついていて、大きな恥さ\\Nらし、君の旗をあちこちで振り回してい\\Nるのだ。', '私たちは、私たちは、あなたを揺さぶる\\Nのだ']
Processed English Subtitles: ['You got mud on your face, you\\Nbig disgrace, kicking your can\\Nall over the place, singing', 'We will, we will rock you', "You're a young man, hard man,\\Nshouting in the street, gonna\\Ntake on the world someday", 'You got blood on your face,\\Nyou big disgrace, waving your\\Nbanner all over the place', 'We will, we will rock you']
✅ Successfully created subtitles: lyrics.ass


In [1]:
#### import json
import subprocess
import numpy as np
import os
from pydub import AudioSegment
import json

# **既存の `final_video.mp4` を削除**
if os.path.exists("final_video.mp4"):
    os.remove("final_video.mp4")
    print("🗑️ Removed existing final_video.mp4")

with open("final_output.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)
with open("processed_data.json", "r", encoding="utf-8") as f:
    processed_data = json.load(f)




chime_file = "../../src/sounds/Bell_Accent06-1(Dry).mp3"  # 注意書きの前のチャイム音
# 🔔 **チャイムと間（数秒）**
chime = AudioSegment.from_file(chime_file, format="mp3")
chime_time = len(chime) / 1000
silence = AudioSegment.silent(duration=1000)
silence_time = len(silence)/1000


notion_duration = json_data[1]["end_time"]  # Duration of Notion image before waveform starts
silence_1_duration = json_data[1]["end_time"]  # Duration of Notion image before waveform starts
video_1_duration = json_data[2]["end_time"]  # Duration of Notion image before waveform starts
silence_2_duration = json_data[3]["end_time"]  # Duration of Notion image before waveform starts
trans_i_duration = []
methane_duration = []
zunda_duration = []
for i in np.arange(0,len(processed_data["english_audio"]),1):
    trans_i_duration.append(json_data[4+3*i]["end_time"])
    methane_duration.append(json_data[4+3*i+1]["end_time"])
    zunda_duration.append(json_data[4+3*i+2]["end_time"])
silence_3_duration = json_data[-2]["end_time"]  # Total duration of the final video
video_2_duration = json_data[-1]["end_time"]  # Duration of Notion image before waveform starts
fade_duration = 0.5  # フェードイン・アウトの長さ（秒）
black_duration = 1
desired_duration_song_bg = silence_3_duration- silence_2_duration
desired_duration_trans = silence_2_duration - video_1_duration
desired_duration_second = silence_3_duration - trans_i_duration[-1]
last_fadeout_duration = 3

# FFmpeg command with improved logic
ffmpeg_cmd = [
    "/usr/bin/ffmpeg",
    # "-hwaccel", "cuda",  # 🚀 Enable GPU Acceleration
    # "-hwaccel_output_format", "cuda",

    # **入力ファイル**
    "-loop", "1", "-t", str(notion_duration), "-i", "output_text_1.jpg",
    "-stream_loop", "-1", "-i", "looped_histogram_movie.mp4",
    "-i", "../../src/frames/outframe.mp4",  # **フレーム動画 (wave の上に重ねる)**
    # Input images with looping and duration setting
    "-loop", "1", "-t", str(desired_duration_song_bg), "-i", "song_background_diluted.jpg",
    "-loop", "1", "-t", str(desired_duration_trans), "-i", "output_text_2.jpg",
    "-loop", "1", "-t", str(desired_duration_second), "-i", "output_text_3.jpg",
    "-loop", "1", "-t", str(last_fadeout_duration), "-i", "output_text_4.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_0.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_1.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_2.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_3.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_4.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_5.jpg",
    "-loop", "1", "-t", str(video_1_duration), "-i", "zunda_methane_output_6.jpg",
    "-loop", "1", "-t", str(desired_duration_song_bg), "-i", "song_background_diluted_2.jpg",
    "-i", "final_output.mp3",  # **音声**
    
    # **フィルタの適用**
    "-filter_complex",
    ##**Notion画像をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d=" + str(silence_1_duration) +"[black]; "   
    # 
    "[0:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(silence_1_duration-( black_duration - fade_duration ))+
    ",fade=t=in:st=0:d=" + str(fade_duration)+ 
    ",fade=t=out:st=" + str(silence_1_duration -black_duration) + 
    ":d=" + str(fade_duration)  + "[notion]; "  
    "[black][notion]overlay[out_1_0];"
    # "[black][notion]overlay=enable='gte(t,"+str( black_duration - fade_duration )+")'[out_1];"
    "[7:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(silence_1_duration-( black_duration - fade_duration ))+
    ",fade=t=in:st=0:d=" + str(fade_duration)+ 
    ",fade=t=out:st=" + str(silence_1_duration -black_duration) + 
    ":d=" + str(fade_duration)  + "[zunda_0]; "  
    "[out_1_0][zunda_0]overlay[out_1];"
    
    # **波形動画をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d="+ str(video_1_duration -silence_1_duration) +"[black]; "   
    # 
    "[1:v]split=2[first_corse_1][first_corse_2];"
    "[first_corse_1]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration=" 
    + str(video_1_duration - silence_1_duration- (black_duration - fade_duration)) +
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(video_1_duration - silence_1_duration -black_duration) + 
    ":d=" + str(fade_duration) + "[wave]; "
    "[black][wave]overlay[out_2_0];"
    
    "[8:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(video_1_duration - silence_1_duration- (black_duration - fade_duration)) +
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(video_1_duration - silence_1_duration -black_duration) +  
    ":d=" + str(fade_duration)  + "[zunda_1]; "  
    "[out_2_0][zunda_1]overlay[out_2];"

    # "[black][wave]overlay=enable='gte(t,"+str(black_duration-fade_duration)+")'[out_2];"
    # # **フレーム動画 (`outframe.mp4`) を `wave` の上に重ねる**
    # "[2:v]scale=854:-1,pad=854:480:(ow-iw)/2:(oh-ih)/2,setsar=1[frame]; "  # **サイズ調整**
    # "[wave][frame]overlay=format=auto:eof_action=pass[wave_frame]; "  # **waveの上にフレームを重ねる**
    # **trans画像をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d="+ str(silence_2_duration - video_1_duration) +"[black]; "   
    #     
    "[4:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration=" 
    + str(silence_2_duration - video_1_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(silence_2_duration - video_1_duration - black_duration ) + 
    ":d=" + str(fade_duration) + "[trans]; "  
    "[black][trans]overlay[out_3_0];"
    # "[black][trans]overlay=enable='gte(t,"+str( black_duration - fade_duration )+")':format=auto[out_3];"
    "[9:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
     + str(silence_2_duration - video_1_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(silence_2_duration - video_1_duration - black_duration ) + 
    ":d=" + str(fade_duration)  + "[zunda_2]; "  
    "[out_3_0][zunda_2]overlay[out_3];"
    
    # **半透明の黒長方形 (fade適用なし)**
    "color=c=black:s=854x480:d="+ str(zunda_duration[-1]- silence_2_duration) +"[black]; "   
    # 
    "color=c=black@0.6:s=1100x600:d=" + str(zunda_duration[-1]- silence_2_duration - (black_duration -fade_duration)) + "[overlay_black_raw]; "
    # **フェードイン・アウトを適用**
    "[overlay_black_raw]fade=t=in:st=0:d=" + str(fade_duration) + ","
    "fade=t=out:st=" + str(zunda_duration[-1]- silence_2_duration -black_duration) + ":d=" + str(fade_duration) + "[overlay_black]; "
    "[overlay_black]drawbox=x=0:y=0:w=iw:h=ih:t=10:color=orange@0.9[rounded_black]; "
    # "[trans_bg][rounded_black]overlay=(W-w)/2:(H-h)/2:format=auto:eof_action=pass[trans_frame_black]; "
    # **
    "[14:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration=" 
    + str(zunda_duration[-1]- silence_2_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) +
    ",fade=t=out:st=" + str(zunda_duration[-1]- silence_2_duration -black_duration ) + 
    ":d=" + str(fade_duration) + "[trans_bg]; "
    "[trans_bg][rounded_black]overlay=(W-w)/2:(H-h)/2[trans_pic_bg];"
    # "[trans_bg][trans_frame_black]overlay=(W-w)/2:(H-h)/2[trans_pic_bg];"
    # **
    "[black][trans_pic_bg]overlay[out_4_0];"
    # "[black][trans_pic_bg]overlay=enable='gte(t,"+str(black_duration - fade_duration)+")'[out_4];"
    # "[black][trans]overlay=enable='gte(t,"+str( black_duration - fade_duration )+")':format=auto[out_3];"
    "[10:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(zunda_duration[-1]- silence_2_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) +
    ",fade=t=out:st=" + str(zunda_duration[-1]- silence_2_duration -black_duration ) + 
    ":d=" + str(fade_duration)  + "[zunda_3]; "  
    "[out_4_0][zunda_3]overlay[out_4];"
    
    # **trans画像をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d="+ str(silence_3_duration - zunda_duration[-1]) +"[black]; "   
    # 
    "[5:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration=" 
    + str(silence_3_duration - zunda_duration[-1] - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(silence_3_duration - zunda_duration[-1] - black_duration) +
    ":d=" + str(fade_duration) + "[second]; "  
    "[black][second]overlay[out_5_0];"
    # "[black][second]overlay=enable='gte(t,"+str( black_duration - fade_duration )+")':format=auto[out_5];"
    
    "[11:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(silence_3_duration - zunda_duration[-1] - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(silence_3_duration - zunda_duration[-1] - black_duration) +
    ":d=" + str(fade_duration)  + "[zunda_4]; "  
    "[out_5_0][zunda_4]overlay[out_5];"
    
    # **波形動画をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d="+ str(video_1_duration -silence_1_duration-fade_duration - last_fadeout_duration) +"[black]; "   
    # 
    "[first_corse_2]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration=" 
    + str(video_1_duration - silence_1_duration- (black_duration - fade_duration)-last_fadeout_duration ) +
    ",fade=t=in:st=0:d=" + str(fade_duration) + 
    ",fade=t=out:st=" + str(video_1_duration - silence_1_duration -black_duration-last_fadeout_duration) + 
    ":d=" + str(fade_duration) + "[wave]; "
    "[black][wave]overlay[out_6_0];"

    "[12:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(video_1_duration - silence_1_duration- (black_duration - fade_duration) -last_fadeout_duration) +
    ",fade=t=in:st=0:d=" + str(fade_duration) +  
    ",fade=t=out:st=" + str(video_1_duration - silence_1_duration -black_duration-last_fadeout_duration) + 
    ":d=" + str(fade_duration) + "[zunda_6]; "  
    "[out_6_0][zunda_6]overlay[out_6];"
    

    # **trans画像をフェードイン・フェードアウト**
    "color=c=black:s=854x480:d="+ str(last_fadeout_duration) +"[black]; " 
    # 
    "[6:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(last_fadeout_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration)  +"[end]; "  
    "[black][end]overlay[out_7_0];"
    # "[black][second]overlay=enable='gte(t,"+str( black_duration - fade_duration )+")':format=auto[out_5];"
    "[13:v]scale=854:-1:force_original_aspect_ratio=decrease,pad=854:480:(ow-iw)/2:(oh-ih)/2:black,setsar=1,trim=start=0:duration="
    + str(last_fadeout_duration - (black_duration - fade_duration))+
    ",fade=t=in:st=0:d=" + str(fade_duration) + "[zunda_7]; "  
    "[out_7_0][zunda_7]overlay[out_7];"
    
    # # **黒背景の上に Notion画像をオーバーレイ*
    "[out_1][out_2][out_3][out_4][out_5]concat=n=5:v=1:a=0[first];"
    "[first][out_6][out_7]concat=n=3:v=1:a=0[fullvideo];"

    
    # **字幕適用**
    "[fullvideo]ass=lyrics.ass[outv]",  
    # "[fullvideo]null[outv]",
    
    # # **出力設定**
    # "-shortest",
    # "-map", "[outv]", "-map", "15:a:0",  
    # "-c:v", "h264_nvenc",
    # "-pix_fmt", "yuv420p",
    # "-c:a", "aac",
    # "-b:a", "192k",
    # "-af", "volume=1.0",  # Ensure audio isn't muted
    # "final_video.mp4"
    # **出力設定**
    "-shortest",
    "-r", "30",  # 👈 フレームレート下げる
    "-map", "[outv]", "-map", "15:a:0",
    "-c:v", "libx264",
    "-preset", "fast",  # 👈 高速プリセット追加
    "-pix_fmt", "yuv420p",
    "-c:a", "aac",
    "-b:a", "192k",
    "-af", "volume=1.0",
    "final_video.mp4"
]


ffmpeg_cmd.extend(["-loglevel", "verbose"])

# Run FFmpeg command in Python
try:
    process = subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print("✅ Video processing complete: final_video.mp4")
except subprocess.CalledProcessError as e:
    print("❌ FFmpeg Error:")
    print(e.stderr.decode())  # Show FFmpeg error details


✅ Video processing complete: final_video.mp4


In [18]:
# from IPython.display import Video

# video_path = "final_video.mp4"  # Ensure this path is correct
# Video(video_path, embed=True)