In [65]:
# Mount Google Drive at /content/drive (Colab only); the file paths used later in
# this notebook point below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## install

In [66]:
#%pip install --upgrade pip --user
#%pip install moviepy==1.0.3
#%pip install opencv-python
#%pip install google-colab
#%pip install ffmpeg
#%pip install mecab-python3
#%pip install unidic-lite
#%pip install --upgrade google-cloud-speech
# Execute the speech-to-text notebook. GenerateCaption() below calls transcribe(),
# which is not defined in the visible cells — presumably it is provided by
# STT.ipynb (TODO confirm).
%run "/content/drive/MyDrive/Labo/STT.ipynb"



## import

In [67]:
#「動画、字幕の開始地点・終了地点・テキストを与えると字幕を付けてくれるシステム」作るぞ
import cv2
from google.colab.patches import cv2_imshow
import numpy as np

#progress barに必要
from tqdm.notebook import tqdm

#日本語の使用に必要
from PIL import Image, ImageDraw, ImageFont

#PIL型の画像を表示する
from IPython.display import display

#csv
import csv

from enum import Enum

## クラス定義

### CapPosType, CapDispType

In [68]:
#字幕の表示位置
class CapPosType(Enum):
  """Where a caption is placed on the output frame."""
  BOTTOM = 0         # bottom of the screen
  BOTTOM_PERSON = 1  # below the person
  OUT = 2            # outside the picture

#字幕の表示方法
class CapDispType(Enum):
  """How the text of a caption is revealed."""
  SEQ = 0    # sequential
  ALL = 1    # all at once
  LYRIC = 2  # lyric style

### Timestamp

In [69]:
class Timestamp:
  """A text fragment paired with a time value."""

  # Class-level defaults, overwritten per instance by __init__.
  text = ""
  time = 0.0

  def __init__(self, text, time):
    self.text, self.time = text, time

  def __str__(self):
    return "{} in {}".format(self.text, self.time)

  def __repr__(self):
    # The repr is intentionally the same as the str form.
    return str(self)

### Caption

In [70]:
class Caption:
  """
  One caption line together with its timing, speaker and draw position.

  Attributes:
    start, end: display range (the visible caller passes frame indices).
    text: full caption text.
    speaker: speaker label with "≫" appended, or a single full-width space
      when no speaker name was given.
    origin: (x, y) position at which the caption is drawn.
    speaker_id: numeric id of the speaker.
    timestamp: per-instance list of objects exposing `.text` and `.time`.
    disp_text: the part of the text to show for the frame currently processed.
  """

  # Immutable class-level defaults. `timestamp` is an empty tuple rather than a
  # list so that no mutable object is shared between instances; every instance
  # receives its own list in __init__.
  start = 0
  end = 0
  text = ""
  speaker = ""
  origin = (0, 0)
  lefttop = (0, 0)
  rightbottom = (0, 0)
  speaker_id = 0
  timestamp = ()
  disp_text = ""

  def __init__(self, start, end, text, speaker, id, origin=(0,0)):
    self.timestamp = []
    self.disp_text = ""
    self.start = start
    self.end = end
    self.text = text
    self.speaker = speaker + "≫" if speaker != "" else "　"
    self.origin = origin
    self.speaker_id = id

  def __str__(self):
    return f"start:{self.start}, end:{self.end}, text:{self.text}, speaker:{self.speaker}"

  def __repr__(self):
    return self.__str__()

  def getWords(self, frame_time) -> str:
    """
    Return the text revealed up to `frame_time`.

    Args:
      frame_time: time to evaluate, in the same unit as the `.time` values
        stored in `self.timestamp`.

    Returns:
      The concatenation, in list order, of the `.text` of every timestamp entry
      whose `.time` is <= frame_time. An empty string (never None) when no
      entry qualifies.
    """
    return "".join(w.text for w in self.timestamp if w.time <= frame_time)

### Captions

In [71]:
class Captions:
  """
  Caption track loaded from the CSV file named by the global Manager `m`.

  The class reads the module-level globals `m` (settings) and `video`
  (frame size), so both must exist before an instance is created.

  CSV columns as consumed by __init__:
    0: start time   1: end time   (both multiplied by fps to get frame indices)
    2: text         3: speaker    4: speaker id (int)
    5, 6: optional x / y position, used only with CapPosType.BOTTOM_PERSON
    7...: alternating (word, time) pairs, read for the SEQ and LYRIC modes
  A row whose first cell is 'start_time' is treated as the header and skipped.
  """

  _dcaptions = ""  # not referenced in the visible code; kept for compatibility

  def __init__(self, fps):
    """
    Load every caption row from `m.csv_path`.

    Args:
      fps: frames per second used to convert the start/end times to frame indices.
    """
    self._captions = []

    # newline='' is what the csv module asks for so that line endings inside
    # quoted fields are handled by the reader itself. The encoding is left at
    # the platform default, exactly as before.
    with open(m.csv_path, 'r', newline='') as f:
      for row in csv.reader(f):
        line = remove_empty_elements(row, 7)

        # Skip blank rows (they used to raise IndexError) and the header row.
        if not line or line[0] == 'start_time':
          continue

        origin = self._compute_origin(line)
        c = Caption(int(float(line[0])*fps), int(float(line[1])*fps), line[2], line[3], int(line[4]), origin)

        # Word-level timestamps are only needed when text is revealed gradually.
        if m.dispType in (CapDispType.LYRIC, CapDispType.SEQ):
          for i in range(7, len(line), 2):
            c.timestamp.append(Timestamp(line[i], float(line[i+1])))

        self._captions.append(c)

    print("caption file successfully loaded")

  @staticmethod
  def _compute_origin(line):
    """Return the (x, y) draw origin for one (already cleaned) CSV row."""
    # Measure text + speaker + "≫", i.e. the widest string that may be drawn.
    left, top, right, bottom = m.font.getbbox(line[2] + line[3] + "≫")
    text_width = right - left
    text_height = bottom - top

    if m.posType == CapPosType.BOTTOM_PERSON and line[5] != "":
      # Columns 5/6 are offsets from the frame centre for the caption's centre;
      # convert them to the top-left corner of the text.
      x = int(line[5]) + video.width // 2 - text_width // 2
      y = int(line[6]) + video.height // 2 - text_height // 2

      # Keep the caption (plus padding) inside the frame.
      x = min(max(x, m.padding[0]), video.width - text_width - m.padding[0])
      y = min(max(y, m.padding[1]), video.height - text_height - m.padding[1])
      return (x, y)

    # Default: fixed x of 50, one padded text box above the bottom edge.
    box_height = text_height + m.padding[1] * 2
    return (50, round(video.height) - box_height - m.padding[1])

  def getCaptions(self, frameid, video_fps):
    """
    Return the captions to show on frame `frameid`.

    Args:
      frameid: index of the frame being rendered.
      video_fps: frames per second of the video.

    Returns:
      A list of Caption objects whose [start, end] range contains `frameid`;
      in SEQ mode also those that ended at most `m.SEQ_dispTime` seconds ago.
      Each returned caption has `disp_text` refreshed for this frame.
      None when no caption applies.
    """
    # In SEQ mode a caption lingers for this many frames after its end.
    linger_frames = video_fps * m.SEQ_dispTime

    captions_in_frame = []
    for c in self._captions:
      if c.start <= frameid <= c.end:
        captions_in_frame.append(c)
      elif m.dispType == CapDispType.SEQ and c.end < frameid <= c.end + linger_frames:
        captions_in_frame.append(c)

    if not captions_in_frame:
      return None

    for c in captions_in_frame:
      # Words whose timestamp has been reached at this frame's time.
      c.disp_text = c.getWords(frameid / video_fps)
    return captions_in_frame

  def __str__(self):
    return f"captions:{self._captions}"

  def __repr__(self):
    return self.__str__()

### VideoCapture

In [72]:
class VideoCapture:
  """
  Opens the video named by the global Manager `m` and caches its properties.

  Attributes:
    path: path of the video file (`m.video_path`).
    cap: the underlying cv2.VideoCapture object.
    frame_count, fps, width: values reported by OpenCV for the file.
    height: frame height, enlarged by OUT_HEIGHT_SCALE when `m.posType` is
      CapPosType.OUT.

  Raises:
    IOError: when OpenCV cannot open the file.
  """

  # Canvas-height multiplier applied in CapPosType.OUT mode.
  OUT_HEIGHT_SCALE = 1.3

  def __init__(self):
    self.path = m.video_path
    self.cap = cv2.VideoCapture(self.path)
    # Without this check a wrong path silently yields frame_count == 0 and the
    # notebook "finishes" with an empty output video.
    if not self.cap.isOpened():
      raise IOError(f"could not open video: {self.path}")

    self.frame_count = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
    self.fps = self.cap.get(cv2.CAP_PROP_FPS)
    self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))

    frame_height = self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    if m.posType == CapPosType.OUT:
      self.height = int(frame_height * self.OUT_HEIGHT_SCALE)
    else:
      self.height = int(frame_height)

    print("video successfully loaded")
    print(self)  # same summary line as __str__

  def __str__(self):
    return f"path:{self.path}, frame_count:{self.frame_count}, fps:{self.fps}, width:{self.width}, height:{self.height}"

  def __repr__(self):
    return self.__str__()

## メソッド定義

### create_overlay(), draw_text_outline(), draw_text()

In [73]:
def create_overlay(img_size, origin, text_bbox, padding, text_bg):
  """
  Build the background layer for one caption.

  Returns a new, fully transparent RGBA image of size `img_size` containing a
  single rectangle filled with `text_bg`: the text bounding box `text_bbox`
  (left, top, right, bottom) shifted by `origin` and grown by `padding`
  (x, y) on every side.
  """
  origin_x, origin_y = origin[0], origin[1]
  pad_x, pad_y = padding[0], padding[1]
  left, top, right, bottom = text_bbox

  upper_left = (origin_x + left - pad_x, origin_y + top - pad_y)
  lower_right = (origin_x + right + pad_x, origin_y + bottom + pad_y)

  layer = Image.new('RGBA', img_size, (0,0,0,0))
  ImageDraw.Draw(layer).rectangle((upper_left, lower_right), fill=tuple(text_bg))
  return layer

def draw_text_outline(draw, origin, text, font, font_outline_color):
  """
  Draw the outline of a caption.

  `text` is rendered four times with `draw.text`, shifted by one pixel to the
  right, left, bottom and top of `origin` (in that order), using
  `font_outline_color` as fill.
  """
  x, y = origin[0], origin[1]
  outline_fill = tuple(font_outline_color)
  for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    draw.text((x + dx, y + dy), text, font=font, fill=outline_fill)

def draw_text(draw, origin, text, speaker_id):
  """
  Draw caption text at `origin` with the font and palette of the global Manager `m`.

  The palette `m.font_colors` holds five normal colours (indices 0-4, index 4
  being white) followed by their five grayed-out counterparts (indices 5-9).
  Callers ask for the grayed-out variant by passing `speaker_id + 5`.

  Args:
    draw: object exposing `text(xy, text, font=..., fill=...)`.
    origin: position handed to `draw.text` unchanged.
    text: string to draw.
    speaker_id: palette index (0-4 normal, 5-9 grayed out).

  When `m.change_color` is true the palette entry `speaker_id` is used as is.
  Otherwise every speaker shares the neutral colour: white for a normal
  request and gray for a grayed-out one. Previously the grayed-out request was
  also painted white, which made grayed-out text indistinguishable.
  """
  neutral_index = 4   # white
  gray_offset = 5     # distance from a colour to its grayed-out counterpart

  if m.change_color:
    color_index = speaker_id
  elif speaker_id >= gray_offset:
    color_index = neutral_index + gray_offset
  else:
    color_index = neutral_index

  draw.text(origin, text, font=m.font, fill=tuple(m.font_colors[color_index]))

### PutText()

In [74]:
def PutText(video, this_cs, frame, bg_image):
  """
  Render the captions of one frame.

  Args:
    video: object exposing `width` and `height` of the output frame.
    this_cs: captions to draw on this frame (may be empty).
    frame: the video frame as an array (as returned by cv2; BGR order).
    bg_image: RGBA canvas onto which the frame is pasted at (0, 0).

  Returns:
    The composed frame as a 3-channel array suitable for cv2.VideoWriter.write.

  Captions are processed last-to-first and stacked upwards: each drawn caption
  raises the next one by its text height plus vertical padding.

  Display modes (`m.dispType`):
    SEQ:   only the words revealed so far are drawn; captions with nothing
           revealed yet are skipped.
    LYRIC: the box is sized for the full text; the revealed part is drawn in
           the normal colour and the remainder directly after it using the
           grayed-out palette index (speaker_id + 5).
    other: the full text is drawn at once.
  """
  # Paste the video frame onto the background canvas.
  img = Image.fromarray(frame)
  img.putalpha(255)
  bg_image.paste(img, (0, 0))

  offset = 0  # vertical shift accumulated by the captions already drawn

  for c in reversed(list(this_cs)):
    # Text revealed so far / complete text, with the speaker label when enabled.
    revealed = (c.speaker + c.disp_text) if m.disp_name else c.disp_text
    complete = (c.speaker + c.text) if m.disp_name else c.text

    if m.dispType == CapDispType.SEQ:
      if c.disp_text == "":
        continue
      box_text = revealed
    else:
      box_text = complete

    left, top, right, bottom = m.font.getbbox(box_text)
    origin = [c.origin[0], c.origin[1] - offset]

    # Semi-transparent background box.
    overlay = create_overlay(bg_image.size, origin, (left, top, right, bottom), m.padding, m.text_bg)
    bg_image = Image.alpha_composite(bg_image, overlay)
    draw = ImageDraw.Draw(bg_image)

    if m.dispType == CapDispType.LYRIC:
      # The not-yet-revealed remainder starts where the revealed text ends, so
      # shift it right by the advance width of the revealed text. The old code
      # shifted by the bbox left/top instead, which put the remainder on top of
      # the revealed text, and redrew everything once per timestamp entry.
      # NOTE(review): if `revealed` is not a prefix of `complete`, removeprefix
      # leaves the whole text as remainder — same as before; confirm the CSV
      # word entries always concatenate to the caption text.
      pending = complete.removeprefix(revealed)
      pending_origin = [origin[0] + round(m.font.getlength(revealed)), origin[1]]

      # Outlines first, then fills (same order as before).
      draw_text_outline(draw, origin, revealed, m.font, m.font_outline_color)
      draw_text_outline(draw, pending_origin, pending, m.font, m.font_outline_color)
      draw_text(draw, origin, revealed, c.speaker_id)
      draw_text(draw, pending_origin, pending, c.speaker_id+5)
    else:
      draw_text_outline(draw, origin, box_text, m.font, m.font_outline_color)
      draw_text(draw, origin, box_text, c.speaker_id)

    offset += (bottom - top) + m.padding[1] * 2

  frame = np.array(bg_image)  # 4 channels; the frame data keeps its BGR order
  frame = cv2.resize(frame, (video.width, video.height))
  frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)  # drop alpha again
  return frame

### GenerateCaption()

In [75]:
def GenerateCaption(video):
  """
  Generate the caption data by calling transcribe() with the paths and bucket
  configured in the global Manager `m`, and return its result.

  `disp_all` is true only when the display mode is CapDispType.ALL.
  The `video` argument is accepted but not used.
  """
  show_all_at_once = m.dispType == CapDispType.ALL
  return transcribe(
      local_video_path=m.video_path,
      bucket_name=m.gcs_bucket_name,
      gcs_key_path=m.gcs_key_path,
      wav_name="tmp.wav",
      csv_path=m.csv_path,
      disp_all=show_all_at_once,
  )

### remove_empty_elements()

In [76]:
def remove_empty_elements(arr, threshold_index):
    """
    Drop the empty elements located at or beyond a given index.

    Args:
        arr: input sequence.
        threshold_index: elements at an index below this value are always kept.

    Returns:
        A new list holding every element whose index is below
        `threshold_index`, plus every later element that is truthy.
    """
    return [item for pos, item in enumerate(arr) if pos < threshold_index or item]

## 字幕全体の設定 (Manager)

In [77]:
class Manager:
  """
  Settings object for the whole captioning run: colours, font, file paths and
  display modes. Constructing it also loads the font from `font_path`.
  """

  def __init__(self):
    # ---- values meant to be edited by the user ----
    # Colours are written in BGR(A) order.
    # NOTE(review): read as BGR, the entries labelled "blue" / "gray blue"
    # come out orange — confirm the intended values.
    self.font_colors = (
        (0, 255, 255, 255),    # yellow
        (0, 255, 0, 255),      # green
        (32, 128, 255, 255),   # blue
        (73, 38, 187, 255),    # red magenta
        (255, 255, 255, 255),  # white
        (0, 128, 128, 255),    # gray yellow
        (0, 128, 0, 255),      # gray green
        (16, 64, 128, 255),    # gray blue
        (37, 19, 94, 255),     # gray red magenta
        (128, 128, 128, 255),  # gray
    )

    self.font_outline_color = (0, 0, 0, 255)
    self.font_size = 50
    self.bg = (0, 0, 0, 255)
    self.text_bg = (32, 32, 32, 64)
    self.padding = (20, 10)
    self.wrapping = 15

    # Input / output locations.
    self.font_path = '/content/drive/MyDrive/Labo/Fonts/MEIRYO.TTC'
    self.video_path = '/content/drive/MyDrive/Labo/001.mp4'
    self.csv_path = '/content/drive/MyDrive/Labo/vid_data/001.csv'
    self.out_dist = '/content/drive/MyDrive/Labo/'
    self.out_name = 'out.mp4'
    self.gcs_key_path = '/content/drive/MyDrive/Labo/gcloud_secret_key.json'
    self.gcs_bucket_name = 'wits-labo-kwmr'

    # Display behaviour.
    self.posType = CapPosType.BOTTOM
    self.dispType = CapDispType.SEQ
    self.disp_name = False
    self.disp_face = True
    self.change_color = False

    self.SEQ_dispTime = 2.0

    # ---- derived automatically from the values above ----
    self.font = ImageFont.truetype(self.font_path, self.font_size)
    self.out_path = f"{self.out_dist}{self.out_name}"

  def __str__(self):
    return f"video path={self.video_path}, csv path={self.csv_path}, out path={self.out_path}"

  def __repr__(self):
    return str(self)

## メインで動かすところ

### 各種読み込み・動画の処理

In [78]:
m = Manager()
video = VideoCapture()  # opens m.video_path and caches frame count / fps / size
#GenerateCaption(video)  # run this when the caption CSV should be generated automatically
cs = Captions(video.fps)

out = cv2.VideoWriter(m.out_path, cv2.VideoWriter_fourcc(*'mp4v'), video.fps, (video.width, video.height))

try:
  # Process the video frame by frame.
  for frame_id in tqdm(range(video.frame_count)):
    ret, frame = video.cap.read()
    if not ret:
      break

    this_cs = cs.getCaptions(frame_id, video.fps)
    bg_image = Image.new('RGBA', (video.width, video.height), m.bg)

    # PutText with an empty caption list performs exactly the paste / convert
    # steps the former caption-less branch duplicated, so one call covers both
    # cases. It returns a BGR frame, which is what out.write() requires.
    frame = PutText(video, this_cs if this_cs is not None else [], frame, bg_image)
    out.write(frame)
finally:
  # Release the writer and the capture even when a frame fails to render,
  # otherwise the output file may be left unfinished.
  out.release()
  video.cap.release()

print("finished")

video successfully loaded
path:/content/drive/MyDrive/Labo/001.mp4, frame_count:705, fps:30.0, width:1280, height:720
caption file successfully loaded


  0%|          | 0/705 [00:00<?, ?it/s]

finished


### 動画に音声を追加

In [79]:
# Disabled step: the code below is wrapped in a string literal, so it is never
# executed. If enabled it would take the audio of m.video_path, attach it to
# the rendered m.out_path and write the result to out2.mp4 in m.out_dist.
"""from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip

video_clip = VideoFileClip(m.out_path)
audio_clip = AudioFileClip(m.video_path)
final_audio = CompositeAudioClip([audio_clip])
final_clip = video_clip.set_audio(final_audio)
final_clip.write_videofile(m.out_dist + "out2.mp4")"""

'from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip\n\nvideo_clip = VideoFileClip(m.out_path)\naudio_clip = AudioFileClip(m.video_path)\nfinal_audio = CompositeAudioClip([audio_clip])\nfinal_clip = video_clip.set_audio(final_audio)\nfinal_clip.write_videofile(m.out_dist + "out2.mp4")'

## 動画を表示

In [80]:
#from IPython.display import Video
#Video("/content/drive/MyDrive/Labo/out.mp4", embed=True)