In [None]:
import csv
import glob
import os
import re
import shutil
import subprocess
import time
from datetime import timedelta

import pandas as pd
import srt
import whisper
from IPython.display import clear_output
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from srt import Subtitle

In [None]:
# Create the working folders used by this notebook.
# Only 'input_video' needs content up front: put the mp4 videos that should
# receive translated subtitles there. The other folders may start empty.
for folder_name in (
    'input_video',
    'output_video_with_subtitle',
    'input_audio',
    'csv_files',
    'excel_files_for_srt',
    'excel_files_for_srt_ja',
    'srt_files_ja',
):
    os.makedirs(folder_name, exist_ok = True)

In [None]:
# Record and show the current working directory; the relative folder paths
# used throughout this notebook are resolved against it.
wp_path = os.getcwd()
print(wp_path)

In [None]:
# Collect every mp4 in ./input_video, normalising Windows '\' separators to '/'.
files = [path.replace('\\', '/') for path in glob.glob('./input_video/*.mp4')]
files

In [None]:
# Display the first discovered video path as a quick check.
files[0]

In [None]:
# Given a path, return only the video name (no directory, no extension).

def extract_video_name(video_path):
    """Return the file name of ``video_path`` without directory or extension.

    Parameters
    ----------
    video_path : str
        Path to a file. Both '/' and '\\' are accepted as separators.

    Returns
    -------
    str
        The base name with its last extension removed,
        e.g. './input_video/talk.v2.mp4' -> 'talk.v2'.
    """
    # Normalise separators first so the result does not depend on the OS.
    normalised = video_path.replace('\\', '/')
    # splitext strips only the final extension and, unlike the previous
    # regex, also works for bare file names and for names with no extension.
    video_name = os.path.splitext(os.path.basename(normalised))[0]
    return video_name

In [None]:
# Test: show the name extracted from the first video path.

extract_video_name(files[0])

## mp4からmp3を取り出す

In [None]:
# Destination folder for the extracted audio tracks.
output_path_mother = './input_audio/'

for file in files:
    video_name = extract_video_name(file)

    # Extract the audio track and save it as mp3 in a separate folder.
    # The context manager closes the clip (and its reader processes) even
    # if writing fails, so file handles do not accumulate across videos.
    with VideoFileClip(file) as video_clip:
        if video_clip.audio is None:
            # A video without an audio track has nothing to transcribe.
            print('skipping ', video_name, ': no audio track')
            continue
        video_clip.audio.write_audiofile(output_path_mother + video_name + '.mp3')

## 文字起こし

In [None]:
# Collect every mp3 in ./input_audio, normalising Windows '\' separators to '/'.
audio_files = [path.replace('\\', '/') for path in glob.glob('./input_audio/*.mp3')]
audio_files

In [None]:
# Load the Whisper model named 'large'; it is reused for every audio file below.
model = whisper.load_model('large')

In [None]:
for audio_file in audio_files:
    clear_output()

    # Show the length of the audio file about to be transcribed.
    sound_input = AudioSegment.from_file(audio_file, format = "mp3")
    sound_duration = sound_input.duration_seconds
    print('audio duration = ', round(sound_duration / 60, 2), 'min\n')

    # Transcribe (the source language is fixed to English).
    result  = model.transcribe(audio_file, verbose = True, language = "en")

    # Turn each segment's start/end time and text into an srt Subtitle.
    # timedelta(seconds=...) is used directly; rebuilding it from .seconds
    # and .microseconds would silently drop whole days.
    out_text = [
        Subtitle(index = 1,
                 start = timedelta(seconds = data["start"]),
                 end = timedelta(seconds = data["end"]),
                 content = data["text"],
                 proprietary = '')
        for data in result["segments"]
    ]

    # Save one CSV row per subtitle: index, "start --> end", text.
    audio_name = extract_video_name(audio_file)

    # newline="" lets the csv module control line endings itself.
    with open("csv_files/" + audio_name + ".csv", mode = "w", encoding = "utf-8_sig", newline = "") as f:
        # csv.writer quotes fields when needed, so commas and line breaks in
        # the transcribed text are preserved instead of being rewritten.
        writer = csv.writer(f, lineterminator = "\n")
        # sort_and_reindex orders and renumbers the subtitles the same way
        # srt.compose does by default.
        for subtitle in srt.sort_and_reindex(out_text):
            # Timestamps use '.' before the milliseconds so that a later
            # split on '.' separates them cleanly.
            start_stamp = srt.timedelta_to_srt_timestamp(subtitle.start).replace(",", ".")
            end_stamp = srt.timedelta_to_srt_timestamp(subtitle.end).replace(",", ".")
            writer.writerow([subtitle.index,
                             start_stamp + " --> " + end_stamp,
                             srt.make_legal_content(subtitle.content)])

## csv_filesフォルダ内の全ての en csvファイルをExcelファイルに変換し、excel_files_for_srtフォルダに保存する

In [None]:
# Collect the paths of the CSV files, normalising '\' separators to '/'.

files = [path.replace('\\', '/') for path in glob.glob('./csv_files/*.csv')]
files

In [None]:
def extract_file_name(file_path):
    """Return the file name of ``file_path`` without directory or extension.

    Parameters
    ----------
    file_path : str
        Path to a file. Both '/' and '\\' are accepted as separators.

    Returns
    -------
    str
        The base name with its last extension removed,
        e.g. './csv_files/talk.v2.csv' -> 'talk.v2'.
    """
    # Normalise separators first so the result does not depend on the OS.
    normalised = file_path.replace('\\', '/')
    # splitext strips only the final extension and, unlike the previous
    # regex, also works for bare file names and for names with no extension.
    file_name = os.path.splitext(os.path.basename(normalised))[0]
    return file_name

In [None]:
# Sanity check: show the name extracted from the first CSV path.

extract_file_name(files[0])

In [None]:
# Convert each CSV file to an Excel workbook with the same base name.

for csv_path in files:
    xlsx_path = 'excel_files_for_srt/' + extract_file_name(csv_path) + '.xlsx'
    # The CSV has no header row; the row index is not written to the workbook.
    pd.read_csv(csv_path, header = None).to_excel(xlsx_path, index = False)

# <font color="red">SUSPEND!!! $\;$ excel_files_for_srtフォルダ内のすべてのExcelファイルを、Google翻訳などで日本語に翻訳し、同じファイル名のままexcel_files_for_srt_jaフォルダに入れてください。</font>

## excel_files_for_srt_ja内のすべての日本語excelファイルを、srtファイルに自動で変換し、srt_files_jaフォルダにコピーする。

In [None]:
# Collect the paths of the Japanese Excel files, normalising '\' to '/'.

files = [path.replace('\\', '/') for path in glob.glob('./excel_files_for_srt_ja/*.xlsx')]
files

In [None]:
# Reshape an Excel-derived table into columns that are easy to write as srt.

def excel2pre_srt(df):
    """Split the timestamp column of ``df`` into separate srt-ready columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns labelled 0, 1 and 2: column 0 is kept as the
        subtitle counter, column 1 holds strings of the form
        'start.milli --> end.milli', and column 2 is kept as the text.

    Returns
    -------
    pandas.DataFrame
        Columns 'counter', 'start', 'start_milli', 'end', 'end_milli', 'text'.
    """
    df_counter = pd.DataFrame(df[0]).rename(columns = {0: 'counter'})

    # 'start --> end' -> two columns.
    df_span = df[1].str.split(' --> ', expand = True)
    df_span.columns = ['start', 'end']

    # 'HH:MM:SS.mmm' -> time part and millisecond part.
    df_start = df_span['start'].str.split('.', expand = True)
    df_start.columns = ['start', 'start_milli']
    df_end = df_span['end'].str.split('.', expand = True)
    df_end.columns = ['end', 'end_milli']

    # Take the text column from the argument itself, not from a notebook
    # global, so the function works for any frame passed in.
    df_pre_srt = pd.concat([df_counter, df_start, df_end, df[2]], axis = 1)
    df_pre_srt = df_pre_srt.rename(columns = {2: 'text'})

    return df_pre_srt

In [None]:
for file in files:
    df_ja = pd.read_excel(file)
    df_process = df_ja.copy()

    df_pre_srt = excel2pre_srt(df_process)

    video_name = extract_video_name(file)

    # Write one srt block per row: counter, timestamp line, text, blank line.
    with open("srt_files_ja/" + video_name + ".srt", mode = "w", encoding = "utf-8") as f:
        for row in range(df_pre_srt.shape[0]):

            counter     = df_pre_srt.loc[row, "counter"]
            start       = df_pre_srt.loc[row, "start"].strip()
            start_milli = df_pre_srt.loc[row, "start_milli"].strip()
            end         = df_pre_srt.loc[row, "end"].strip()
            end_milli   = df_pre_srt.loc[row, "end_milli"].strip()

            # A text cell may come back from Excel empty (NaN) or as a number;
            # neither has .strip(), so coerce to str before trimming.
            raw_text    = df_pre_srt.loc[row, "text"]
            text        = "" if pd.isna(raw_text) else str(raw_text).strip()

            print(counter, file = f)
            # srt timestamps use ',' before the milliseconds.
            print(start + ',' + start_milli + ' --> ' + end + ',' + end_milli, file = f)
            print(text, file = f)
            print("", file = f)

## 字幕生成

In [None]:
# Collect the paths of the video files, normalising '\' separators to '/'.

files = [path.replace('\\', '/') for path in glob.glob('./input_video/*.mp4')]
files

In [None]:
for file in files:
    video_name = extract_video_name(file)
    print('processing ', video_name, '...')

    try:
        # Create the ass file from the Japanese srt.
        # '-y' overwrites leftovers from an interrupted run; check=True
        # raises CalledProcessError when ffmpeg exits with a non-zero status.
        shutil.copy('./srt_files_ja/' + video_name + '.srt', 'temp.srt')
        subprocess.run(['ffmpeg', '-y', '-i', 'temp.srt', 'temp.ass'], check = True)

        # Copy the video to be subtitled into the working directory.
        shutil.copy(file, 'temp_video.mp4')

        # Burn the subtitles into the video.
        subprocess.run(['ffmpeg', '-y', '-i', 'temp_video.mp4',
                        '-vf', 'ass=temp.ass', 'temp_video_sub.mp4'],
                       check = True)

        # Save the subtitled video under its final name.
        shutil.move('temp_video_sub.mp4',
                    'output_video_with_subtitle/' + video_name + '_sub.mp4')
    finally:
        # Remove the working files, also when a step above failed.
        for temp_file in ('temp_video.mp4', 'temp.srt', 'temp.ass', 'temp_video_sub.mp4'):
            if os.path.exists(temp_file):
                os.remove(temp_file)

    print('... ', video_name, ' --- Done\n')

print('ALL DONE!')

## 引き続き動画を圧縮する場合

In [None]:
# cmd_3 = 'ffmpeg -i Takataken_Kinugawa_sub.mp4 -crf 40 Takataken_Kinugawa_sub_small.mp4'
# subprocess.call(cmd_3, shell=True)

## movieをカットする

In [None]:
# # 必要に応じてmp4ファイルの情報を取得

# import sys
# import ffmpeg
# from pprint import pprint

# in_filename = sys.argv[1]
# probe = ffmpeg.probe('test.mp4')
# for stream in probe['streams']:
#     print('stream {0}: {1}'.format(stream['index'],stream['codec_type']))
#     pprint(stream)
#     print('')

In [None]:
# Cut a time range out of a video and save it as a new file.
# VideoFileClip is already imported in the notebook's import cell, so the
# wildcard import that used to sit here is not needed.
file_path = "test.mp4"

start = 9740          # cut start, in seconds
end = start + 2000    # cut end, in seconds

save_path = "output.mp4"

# The context manager closes the source clip once writing has finished
# (or failed).
with VideoFileClip(file_path) as source_clip:
    video = source_clip.subclip(start, end)
    video.write_videofile(save_path, fps=30)