<a href="https://colab.research.google.com/github/Kai3150/D/blob/master/mojiokoshi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q git+https://github.com/openai/whisper.git  # Whisper本体をインストール
!pip install -q ffmpeg-python  # ffmpeg連携用


In [3]:
from google.colab import files
uploaded = files.upload()  # 選択したファイルがアップロードされる


In [None]:
import whisper

# Whisperのモデルを読み込み (例: "base"モデル)
model = whisper.load_model("base")

# アップロードしたファイル名を取得
file_name = list(uploaded.keys())[0]  # 先ほどfiles.upload()したファイル

# 音声を文字起こし
# language="ja" を指定すると日本語として推論が走りやすくなります（autoでもOK）。
result = model.transcribe(file_name, language="ja")

# 結果を表示
print("=== Whisper Transcription Result ===")
print(result["text"])


In [18]:
################## Youtubeから動画を取得してモデルを学習させる ##################
!pip install -q yt_dlp webvtt-py pydub
!pip install -q fsspec==2024.12.0
!pip install -q --upgrade --no-deps gcsfs
!pip install -q datasets transformers

In [5]:
#######################################################
#### 1. YouTubeから音声＆字幕を取得
#######################################################

import yt_dlp

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': 'data/%(id)s.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }],
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitleslangs': ['ja'],
}

video_urls = [
    "https://www.youtube.com/watch?v=Bty3cU04_9s",  # ゴールドマンサックスの動画 3min
    "https://www.youtube.com/watch?v=f6zV71sD220",  # DBJの就職活動する人の動画 13min
    # 必要な動画URLを追加
]

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(video_urls)


[youtube] Extracting URL: https://www.youtube.com/watch?v=Bty3cU04_9s
[youtube] Bty3cU04_9s: Downloading webpage
[youtube] Bty3cU04_9s: Downloading tv client config
[youtube] Bty3cU04_9s: Downloading player 643afba4
[youtube] Bty3cU04_9s: Downloading tv player API JSON
[youtube] Bty3cU04_9s: Downloading ios player API JSON
[youtube] Bty3cU04_9s: Downloading m3u8 information
[info] Bty3cU04_9s: Downloading subtitles: ja
[info] Bty3cU04_9s: Downloading 1 format(s): 251
[info] Writing video subtitles to: data/Bty3cU04_9s.ja.vtt
[download] Destination: data/Bty3cU04_9s.ja.vtt
[download] 100% of   14.85KiB in 00:00:00 at 75.63KiB/s
[download] Destination: data/Bty3cU04_9s.webm
[download] 100% of    1.99MiB in 00:00:00 at 4.66MiB/s   
[ExtractAudio] Destination: data/Bty3cU04_9s.wav
Deleting original file data/Bty3cU04_9s.webm (pass -k to keep)
[youtube] Extracting URL: https://www.youtube.com/watch?v=f6zV71sD220
[youtube] f6zV71sD220: Downloading webpage
[youtube] f6zV71sD220: Downloading t

In [9]:
#######################################################
#### 2. 字幕ファイルをパースしてタイムスタンプとテキストを抽出
#######################################################

import webvtt
import csv
import os

def time_to_seconds(t):
    h, m, s = t.split(":")
    s, ms = s.split(".")
    return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000

def parse_vtt(vtt_path):
    segments = []
    for caption in webvtt.read(vtt_path):
        segments.append({
            "start": time_to_seconds(caption.start),
            "end": time_to_seconds(caption.end),
            "text": caption.text.replace("\n", " ")
        })
    return segments

output_csv = "dataset.csv"
data_dir = "data/"

with open(output_csv, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["video_id", "audio_file", "start", "end", "text"])
    for file in os.listdir(data_dir):
        if file.endswith(".vtt"):
            video_id = file.split(".")[0]
            vtt_path = os.path.join(data_dir, file)
            segments = parse_vtt(vtt_path)
            audio_file = os.path.join(data_dir, f"{video_id}.wav")
            for seg in segments:
                writer.writerow([video_id, audio_file, seg["start"], seg["end"], seg["text"]])


In [12]:
#######################################################
#### 3. 字幕タイムスタンプに沿って音声セグメントを作成
#######################################################

from pydub import AudioSegment

input_csv = "dataset.csv"
output_dir = "segments/"
os.makedirs(output_dir, exist_ok=True)

with open(input_csv, "r", encoding="utf-8") as f_in, open("training_data.csv", "w", newline="", encoding="utf-8") as f_out:
    reader = csv.DictReader(f_in)
    writer = csv.writer(f_out)
    writer.writerow(["audio", "text"])
    for row in reader:
        audio_file = row["audio_file"]
        start_ms = float(row["start"]) * 1000
        end_ms = float(row["end"]) * 1000
        text = row["text"]
        audio = AudioSegment.from_wav(audio_file)
        segment = audio[start_ms:end_ms]
        seg_filename = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(audio_file))[0]}_{int(start_ms)}_{int(end_ms)}.wav")
        segment.export(seg_filename, format="wav")
        writer.writerow([seg_filename, text])


In [None]:
#######################################################
#### 4. Hugging Face Dataset を作成し Whisper を Fine-Tuning
#######################################################

import pandas as pd
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split

# CSVからデータフレーム作成（列：audio, text）
df = pd.read_csv("training_data.csv")
df = df.rename(columns={"text": "correct"})

# Datasetに変換し、Audio型にキャスト
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Processorの読み込み
processor = WhisperProcessor.from_pretrained("openai/whisper-large", language="ja", task="transcribe")

def prepare_dataset(batch):
    batch["input_features"] = processor.feature_extractor(batch["audio"]["array"],
                                                           sampling_rate=batch["audio"]["sampling_rate"]).input_features[0]
    batch["labels"] = processor.tokenizer(batch["correct"]).input_ids
    return batch

processed_dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
# Convert Hugging Face Dataset to pandas DataFrame
df = processed_dataset.to_pandas()

# Split the DataFrame
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrames back to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)
# モデルの読み込み
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# 学習パラメータの設定
training_args = Seq2SeqTrainingArguments(
    output_dir="./finetuned_whisper",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Pass the training dataset
    eval_dataset=eval_dataset,   # Pass the evaluation dataset
    tokenizer=processor.feature_extractor,
)

trainer.train()
trainer.save_model("./finetuned_whisper")


Map:   0%|          | 0/652 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

In [None]:
#######################################################
#### 5. Fine-Tuning済みモデルで推論
#######################################################

from transformers import AutoProcessor, WhisperForConditionalGeneration

processor = AutoProcessor.from_pretrained("openai/whisper-large", language="ja", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("./finetuned_whisper")

test_audio = "test/test_audio.wav"  # 推論対象の音声ファイルパス
inputs = processor(test_audio, return_tensors="pt")
generated_ids = model.generate(inputs.input_features)
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)


# 新しいセクション