## install

In [None]:
# Optional dependency installation: uncomment the lines below when setting up a fresh runtime.
#%pip install --upgrade pip --user
#%pip install moviepy==1.0.3
#%pip install opencv-python
#%pip install google-colab
#%pip install ffmpeg
#%pip install mecab-python3
#%pip install unidic-lite
#%pip install --upgrade google-cloud-speech
# Execute the companion speech-to-text notebook inside this kernel.
# NOTE(review): GenerateCaption() below calls transcribe(), which is not defined in this
# notebook - presumably it is provided by STT.ipynb; confirm.
# NOTE(review): this path lives under /content/drive, but drive.mount() is only called in a
# later cell, so a fresh "Run All" may fail here unless Drive is already mounted.
%run "/content/drive/MyDrive/Labo/STT.ipynb"



## import

In [None]:
#「動画、字幕の開始地点・終了地点・テキストを与えると字幕を付けてくれるシステム」作るぞ
import cv2
from google.colab.patches import cv2_imshow
import numpy as np

#progress barに必要
from tqdm.notebook import tqdm

#日本語の使用に必要
from PIL import Image, ImageDraw, ImageFont

#PIL型の画像を表示する
from IPython.display import display

#csv
import csv

from enum import Enum

In [None]:
# Mount Google Drive at /content/drive; every path configured in Manager points below this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## クラス定義

### CapPosType, CapDispType

In [None]:
# Where a caption is placed on the frame.
# NOTE(review): PutText() currently computes the same origin for all three members.
class CapPosType(Enum):
  # Bottom of the screen
  BOTTOM = 0
  # Below the speaking person
  BOTTOM_PERSON = 1
  # Outside the picture (VideoCapture makes the output 1.3x taller for this mode)
  OUT = 2

# How a caption is displayed.
class CapDispType(Enum):
  # Sequential, all at once, lyric-style (in that order)
  # GenerateCaption() passes disp_all=True to transcribe() only for ALL.
  SEQ = 0
  ALL = 1
  LYRIC = 2

### Caption, Captions

In [None]:
class Caption:
  """
  One subtitle entry: the frame span it covers, its text, its speaker label
  and the point where it is drawn.
  """

  # Class-level defaults. __init__ overrides everything except lefttop and
  # rightbottom, which are never assigned per instance in this class.
  start, end = 0, 0
  text, speaker = "", ""
  origin = (0, 0)
  lefttop = (0, 0)
  rightbottom = (0, 0)
  speaker_id = 0

  def __init__(self, start, end, text, speaker, id, sp_posx=0, sp_posy=0):
    """
    Args:
      start: first frame on which the caption is shown.
      end: last frame on which the caption is shown.
      text: caption text.
      speaker: speaker name; "≫" is appended, and an empty name becomes a
        single full-width space.
      id: speaker id, stored as speaker_id.
      sp_posx, sp_posy: drawing origin, stored as the tuple `origin`.
    """
    self.start, self.end = start, end
    self.text = text
    if speaker != "":
      self.speaker = speaker + "≫"
    else:
      self.speaker = "　"
    self.speaker_id = id
    self.origin = (sp_posx, sp_posy)

  def __str__(self):
    """Human-readable summary of the span, text and speaker label."""
    return "start:{}, end:{}, text:{}, speaker:{}".format(
        self.start, self.end, self.text, self.speaker)

  def __repr__(self):
    """Same text as __str__."""
    return self.__str__()

class Captions:
  """
  Collection of Caption objects loaded from a CSV file.

  Each data row is read as: column 0 start time, column 1 end time,
  column 2 text, column 3 speaker name, column 4 integer speaker id.
  A row whose first cell is 'start_time' is treated as the header and skipped.
  Start/end times are multiplied by fps and truncated with int() to obtain
  frame indices.
  """

  # Class-level placeholders; __init__ always rebinds fresh per-instance containers.
  _captions = []
  _speakers = {}

  def __init__(self, fps, m, video_height=None):
    """
    Args:
      fps: frames per second used to convert CSV times into frame indices.
      m: settings object; m.csv_path, m.font (getbbox) and m.padding are read.
      video_height: height in pixels of the output frame, used to anchor the
        captions near the bottom. When omitted, the notebook-global
        `video.height` is used, which is what the original code always did.

    Raises:
      IndexError / ValueError: if a non-empty data row has fewer than five
        columns or non-numeric time / id fields.
    """
    self._captions = []
    self._speakers = {}

    if video_height is None:
      # Backward-compatible fallback to the global created in the main cell.
      video_height = video.height

    # Measure a dummy string mixing kanji, kana, latin and punctuation so every
    # caption line gets the same height. This does not depend on the row, so
    # it is computed once instead of once per CSV line.
    _, top, _, bottom = m.font.getbbox("漢あA　、^,g")
    text_height = (bottom - top) + m.padding[1] * 2

    # Base drawing position shared by every caption.
    origin_x = 50
    origin_y = round(video_height) - text_height - m.padding[1]

    # newline='' lets the csv module handle line endings itself.
    with open(m.csv_path, 'r', newline='') as f:
      for line in csv.reader(f):
        # Skip blank lines (csv yields []) and the header row.
        if not line or line[0] == 'start_time':
          continue

        self._captions.append(
            Caption(int(float(line[0])*fps), int(float(line[1])*fps),
                    line[2], line[3], int(line[4]), origin_x, origin_y)
            )

  def getCaptions(self, frameid, wrapping):
    """
    Return the captions visible on the given frame.

    Args:
      frameid: frame index to look up.
      wrapping: accepted for caller compatibility; not used by this method.

    Returns:
      A list of every Caption with start <= frameid <= end, in file order,
      or None when no caption covers the frame.
    """
    captions_in_frame = [c for c in self._captions if c.start <= frameid <= c.end]
    return captions_in_frame or None

  def __str__(self):
    return f"captions:{self._captions}"
  def __repr__(self):
    return self.__str__()

### VideoCapture

In [None]:
class VideoCapture:
  """
  Opens a video file with OpenCV and caches its basic properties.

  Attributes:
    path: the file path taken from m.video_path.
    cap: the underlying cv2.VideoCapture handle.
    frame_count, fps, width: values reported by the capture.
    height: the reported frame height, enlarged by OUT_HEIGHT_SCALE when
      m.posType is CapPosType.OUT.
  """

  # Vertical enlargement applied for CapPosType.OUT.
  OUT_HEIGHT_SCALE = 1.3

  def __init__(self, m):
    """
    Args:
      m: settings object; m.video_path and m.posType are read.

    Raises:
      IOError: if OpenCV cannot open the file. Without this check the capture
        reports zero frames and the caller would silently write an empty video.
    """
    self.path = m.video_path
    self.cap = cv2.VideoCapture(self.path)
    if not self.cap.isOpened():
      self.cap.release()
      raise IOError(f"Could not open video: {self.path}")

    self.frame_count = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
    self.fps = self.cap.get(cv2.CAP_PROP_FPS)
    self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))

    source_height = self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    if m.posType == CapPosType.OUT:
      self.height = int(source_height * self.OUT_HEIGHT_SCALE)
    else:
      self.height = int(source_height)

  def __str__(self):
    return f"path:{self.path}, frame_count:{self.frame_count}, fps:{self.fps}, width:{self.width}, height:{self.height}"
  def __repr__(self):
    return self.__str__()

## メソッド定義

### create_overlay(), draw_text_outline(), draw_text()

In [None]:
def create_overlay(img_size, origin, text_bbox, padding, text_bg):
  """
  Build a fully transparent RGBA layer containing one filled rectangle that
  serves as the caption background.

  Args:
    img_size: size of the layer to create.
    origin: (x, y) point the text bounding box is relative to.
    text_bbox: (left, top, right, bottom) of the text.
    padding: (horizontal, vertical) margin added around the box.
    text_bg: fill colour of the rectangle; converted to a tuple.

  Returns:
    The new layer image.
  """
  layer = Image.new('RGBA', img_size, (0,0,0,0))
  painter = ImageDraw.Draw(layer)

  box_left, box_top, box_right, box_bottom = text_bbox
  pad_x, pad_y = padding[0], padding[1]
  base_x, base_y = origin[0], origin[1]

  upper_left = (base_x + box_left - pad_x, base_y + box_top - pad_y)
  lower_right = (base_x + box_right + pad_x, base_y + box_bottom + pad_y)

  painter.rectangle((upper_left, lower_right), fill=tuple(text_bg))
  return layer

def draw_text_outline(draw, origin, text, font, font_outline_color):
  """
  Draw a one-pixel outline by rendering the text four times, shifted
  right, left, down and up from `origin`, in the outline colour.
  """
  for shift_x, shift_y in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    shifted = (origin[0] + shift_x, origin[1] + shift_y)
    draw.text(shifted, text, font=font, fill=tuple(font_outline_color))

def draw_text(draw, origin, text, m, speaker_id):
  """
  Draw the caption text itself.

  Args:
    draw: drawing object exposing text(xy, text, font=..., fill=...).
    origin: position passed through to draw.text.
    text: string to render.
    m: settings object; m.change_color, m.font and m.font_colors are read.
    speaker_id: index into m.font_colors, used when m.change_color is true.

  When m.change_color is false, the default colour m.font_colors[4] is used.
  When it is true, the colour is m.font_colors[speaker_id]; an id past the end
  of the palette falls back to the default colour instead of aborting the
  render with IndexError.
  """
  default_index = 4
  palette = m.font_colors
  wanted_index = speaker_id if m.change_color else default_index
  try:
    color = palette[wanted_index]
  except IndexError:
    # More speakers than palette entries: reuse the default colour.
    color = palette[default_index]
  draw.text(origin, text, font=m.font, fill=tuple(color))

### PutImage()


In [None]:
def PutImage(frame, bg_image):
  """
  Paste a video frame onto the background image at its top-left corner.

  The frame array is converted to an image and given a fully opaque alpha
  channel first. `bg_image` is modified in place; nothing is returned.
  """
  top_left = (0, 0)
  opaque_frame = Image.fromarray(frame)
  opaque_frame.putalpha(255)
  bg_image.paste(opaque_frame, top_left)

### PutText()

In [None]:
def PutText(m, video, this_cs, frame, bg_image):
  """
  Render the given captions onto one frame.

  Args:
    m: settings object; disp_name, font, padding, text_bg, font_outline_color
      (and whatever draw_text reads) are used.
    video: object providing the output `width` and `height`.
    this_cs: captions to draw on this frame.
    frame: the frame array as read from the capture.
    bg_image: RGBA background image; the frame is pasted onto it in place.

  Returns:
    A 3-channel array of size (video.width, video.height) suitable for
    cv2.VideoWriter.write().
  """
  # Paste the raw frame onto the background (mutates bg_image in place).
  PutImage(frame, bg_image)

  offset = 0  # vertical offset accumulated from captions already drawn

  # Walk the captions last-to-first: the last one is drawn at its own origin
  # and each earlier one is shifted further up by the accumulated offset.
  for c in reversed(list(this_cs)):
    text = c.speaker + c.text if m.disp_name else c.text

    left, top, right, bottom = m.font.getbbox(text)
    text_height = bottom - top

    # Every CapPosType currently uses the same anchor. A single assignment
    # also avoids leaving `origin` unbound for an unexpected m.posType, which
    # the previous three identical if/elif branches allowed.
    origin = [c.origin[0], c.origin[1] - offset]

    overlay = create_overlay(bg_image.size, origin, (left, top, right, bottom), m.padding, m.text_bg)
    bg_image = Image.alpha_composite(bg_image, overlay)

    draw = ImageDraw.Draw(bg_image)
    draw_text_outline(draw, origin, text, m.font, m.font_outline_color)
    draw_text(draw, origin, text, m, c.speaker_id)
    # NOTE(review): the overlay is composited a second time on top of the text,
    # which tints the glyphs with the background colour. Kept to preserve the
    # current look - confirm whether this is intentional.
    bg_image = Image.alpha_composite(bg_image, overlay)

    # Reserve room for this line before placing the next one above it.
    offset += text_height + m.padding[1] * 2

  # The array keeps the channel order the frame arrived in, plus alpha.
  frame = np.array(bg_image)
  frame = cv2.resize(frame, (video.width, video.height))
  # Drop the alpha channel; VideoWriter expects 3-channel frames.
  frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)
  return frame

### GenerateCaption()

In [None]:
def GenerateCaption(m, video):
  """
  Run transcribe() for the configured video and return its result.

  Paths and bucket settings are taken from `m`; disp_all is true only when
  m.dispType is CapDispType.ALL. The audio is written under the fixed name
  "tmp.wav". `video` is accepted but not used.
  """
  options = dict(
      local_video_path=m.video_path,
      bucket_name=m.gcs_bucket_name,
      gcs_key_path=m.gcs_key_path,
      wav_name="tmp.wav",
      csv_path=m.csv_path,
      disp_all=(m.dispType == CapDispType.ALL),
  )
  return transcribe(**options)

## 字幕全体の設定 (Manager)

In [None]:
class Manager:
  """
  Central settings object: input/output paths, cloud settings, caption
  appearance and display options. Edit the values in __init__ to configure a run.
  """

  def __init__(self):
    # ---- user-editable: files and cloud ----
    self.font_path = '/content/drive/MyDrive/Labo/Fonts/MEIRYO.TTC'
    self.video_path = '/content/drive/MyDrive/Labo/001.mp4'
    self.csv_path = '/content/drive/MyDrive/Labo/vid_data/001.csv'
    self.out_dist = '/content/drive/MyDrive/Labo/'
    self.out_name = 'out.mp4'
    self.gcs_key_path = '/content/drive/MyDrive/Labo/gcloud_secret_key.json'
    self.gcs_bucket_name = 'wits-labo-kwmr'

    # ---- user-editable: appearance (colours are in BGR order, plus alpha) ----
    # NOTE(review): the third entry was labelled "blue", but (32, 128, 255)
    # read as BGR is orange - confirm the intended colour.
    self.font_colors = (
        (0, 255, 255, 255),    # yellow
        (0, 255, 0, 255),      # green
        (32, 128, 255, 255),   # blue (per the original label)
        (73, 38, 187, 255),    # red-magenta
        (255, 255, 255, 255),  # white
    )
    self.font_outline_color = (0, 0, 0, 255)
    self.font_size = 50
    self.bg = (0, 0, 0, 255)
    self.text_bg = (32, 32, 32, 64)
    self.padding = (20, 10)
    self.wrapping = 15

    # ---- user-editable: behaviour ----
    self.posType = CapPosType.OUT
    self.dispType = CapDispType.ALL
    self.disp_name = True
    self.disp_face = True
    self.change_color = True

    # ---- derived automatically from the settings above ----
    self.font = ImageFont.truetype(self.font_path, self.font_size)
    self.out_path = f"{self.out_dist}{self.out_name}"

  def __str__(self):
    """Summary of the three main paths."""
    return "video path={}, csv path={}, out path={}".format(
        self.video_path, self.csv_path, self.out_path)

  def __repr__(self):
    """Same text as __str__."""
    return self.__str__()

## メインで動かすところ

### 各種読み込み・動画の処理

In [None]:
# Load the settings, open the source video and read the captions.
m=Manager()
video = VideoCapture(m)  # user-defined wrapper exposing fps / size / frame count
#GenerateCaption(m, video)  # run this first when the captions should be auto-generated
cs = Captions(video.fps, m)

out = cv2.VideoWriter(m.out_path, cv2.VideoWriter_fourcc(*'mp4v'), video.fps, (video.width, video.height))

try:
  # Process the video frame by frame.
  for i in tqdm(range(video.frame_count)):
    ret, frame = video.cap.read()
    if not ret:
      break

    this_cs = cs.getCaptions(i, m.wrapping)

    # Fresh background per frame: the frame (and any captions) are drawn onto it.
    bg_image = Image.new('RGBA', (video.width, video.height), m.bg)

    # out.write() only accepts 3-channel frames, so both paths end without alpha.
    if this_cs is not None:
      frame = PutText(m, video, this_cs, frame, bg_image)
    else:
      PutImage(frame, bg_image)
      frame = np.array(bg_image)  # frame channels plus alpha
      frame = cv2.resize(frame, (video.width, video.height))
      frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)  # drop alpha again
    out.write(frame)
finally:
  # Always finalise the output file and free the capture, even after an error.
  out.release()
  video.cap.release()

  0%|          | 0/705 [00:00<?, ?it/s]

### 動画に音声を追加

In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip

# Attach the audio of the source video (m.video_path) to the captioned video
# (m.out_path) and write the result as out2.mp4 in the output directory.
video_clip = VideoFileClip(m.out_path)
try:
  audio_clip = AudioFileClip(m.video_path)
  try:
    final_audio = CompositeAudioClip([audio_clip])
    final_clip = video_clip.set_audio(final_audio)
    final_clip.write_videofile(m.out_dist + "out2.mp4")
  finally:
    # Close the audio reader even if writing fails.
    audio_clip.close()
finally:
  # Close the video reader so the file handle / subprocess is not leaked.
  video_clip.close()

Moviepy - Building video /content/drive/MyDrive/Labo/out2.mp4.
MoviePy - Writing audio in out2TEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video /content/drive/MyDrive/Labo/out2.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/Labo/out2.mp4


## 動画を表示

In [None]:
#from IPython.display import Video
#Video("/content/drive/MyDrive/Labo/out.mp4", embed=True)