In [16]:
import pytube
import pydub
import cv2
import PIL
import pytesseract
import cnocr
import zhon
import string
import cnocr
import os
import re


os.environ["IMAGEIO_FFMPEG_EXE"] = "/opt/homebrew/bin/ffmpeg"

import moviepy.editor as mpeditor
import moviepy.video.fx.all as mpfx

target_dir = "./all_data"
video_fps = 5
failfile = open(target_dir + "/fail.txt", "w")
list_url = "https://www.youtube.com/playlist?list=PL96kIIcXJpMvasYY-YOVv4ZgmqaDd9e5x"

In [17]:
def cleartext(text):
    text = re.sub(r"\s+", "", text)
    text = re.sub("\(.*?\)","", text)
    text = re.sub("[" + string.punctuation + "]", "", text)
    text = re.sub("[" + zhon.hanzi.punctuation + "]", "", text)
    return text


def is_space(text):
    if text == "" or len(text) < 3: 
        return True
    for c in text:
        if text.count(c) > 3:
            return True
    return False


def text_is_differ(s1, s2):
    n1 = len(s1)
    n2 = len(s2)

    dp = [[None] * (n2 + 1) for i in range(n1 + 1)]

    for i in range(n1 + 1):
        for j in range(n2 + 1):
            if i == 0 or j == 0 :
                dp[i][j] = 0
            elif s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
                
    return dp[n1][n2] < n1 - 5



def get_subtitle(in_path, out_path):
    video = cv2.VideoCapture(in_path)
    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    subtitle = open(out_path, "w")

    pre_text = ""
    duration_time = []
    start_time = 0
    end_time = 0
    kernel2 = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    kernel3 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

    for i in range(total_frame):
        success, img = video.read()
        if not success:
            break
        
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY);
        ret, img = cv2.threshold(img, 240, 255, cv2.THRESH_BINARY)
        img = cv2.bitwise_not(img)  

        img = cv2.erode(img, kernel3)
        img = cv2.dilate(img, kernel2)

        cv2.imshow("video", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

        img = PIL.Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        text = pytesseract.image_to_string(img, lang='chi_tra', config="--psm 7")

        text = cleartext(text)

        if text_is_differ(text, pre_text):
            if not is_space(pre_text):
                end_time = round(i / video_fps, 2)
                subtitle.write("\"" + str(start_time) + "-" + str(end_time) + "\",\"" + pre_text + "\"\n")
                duration_time.append([start_time, end_time])
            start_time = round(i / video_fps, 2)


        pre_text = text

    if not is_space(text):
        end_time = round(total_frame / video_fps, 2)
        subtitle.write(str(start_time) + "~" + str(end_time) + ":" + pre_text + "\n")
        duration_time.append([start_time, end_time])

    
    subtitle.close()
    return duration_time

def processing(idx, video):
    print(idx, video.title, "running")
    tmp_video = target_dir + "/mp4/tmp" + str(idx) + ".mp4"
    mp4_path = target_dir + "/mp4/" + str(idx) + ".mp4"
    wav_path = target_dir + "/wav/" + str(idx) + ".wav"
    txt_path = target_dir + "/csv/" + str(idx) + ".csv"

    if not os.path.isfile(mp4_path):
        try:
            video.streams.filter().get_highest_resolution().download(filename="tmp" + str(idx) + ".mp4", output_path=target_dir + "/mp4")
            mp4 = mpeditor.VideoFileClip(tmp_video)
            mp4 = mpfx.crop(mp4, x1=350, y1=550, width=600, height=50)
            mp4 = mp4.subclip(0, mp4.duration - 2)
            mp4.write_videofile(filename=mp4_path, fps=video_fps, logger=None)
            os.remove(tmp_video)

        except:
            print(video.title, " cant downlaod")
            failfile.write(video.title + " cant downlaod.\n")
            return
    

    if not os.path.isfile(wav_path):
        try:
            audio = mpeditor.AudioFileClip(mp4_path)
            audio.write_audiofile(filename=wav_path, fps=16000, nbytes=2, logger=None)
            sound = pydub.AudioSegment.from_wav(wav_path)
            sound = sound.set_channels(1)
            sound.export(wav_path, format="wav")
        except:
            print(video.title, " cant convert")
            failfile.write(video.title + " cant convert.\n")
            return 

    
    if not os.path.isfile(txt_path):
        try:
            duration_time = get_subtitle(mp4_path, txt_path)
        except:
            print(video.title, " cant get subtitle")
            failfile.write(video.title + "  get subtitle.\n")
            return
        
    print(idx, "success")
    

    

In [18]:


if not os.path.isdir(target_dir + "/mp4"):
    os.makedirs(target_dir + "/mp4")
if not os.path.isdir(target_dir + "/wav"):
    os.makedirs(target_dir + "/wav")
if not os.path.isdir(target_dir + "/txt"):
    os.makedirs(target_dir + "/txt")

playlist = pytube.Playlist(list_url)

print("start")
for [idx, video] in enumerate(playlist.videos):
    if idx == 1:
        break
    print(idx, video.title, "running")
    
    processing(idx, video)
    print(idx, video.title, "done")


failfile.close()
    



start
0 部落「浴火重生」 原客夫妻回鄉共營咖啡店【客家新聞20220812】 running
0 部落「浴火重生」 原客夫妻回鄉共營咖啡店【客家新聞20220812】 running
