In [1]:
import pandas as pd
from pytube import YouTube
import pydub
import os
import re
import glob

os.environ["IMAGEIO_FFMPEG_EXE"] = "/opt/homebrew/bin/ffmpeg"

import moviepy.editor as mp



In [2]:
# Working directory: holds the input url.csv and receives every generated file.
target_dir = "./data"

# Video index; later cells read its "ID", "Title" and "Description" columns.
data = pd.read_csv(f"{target_dir}/url.csv")

In [3]:
def download_video(tag, path):
    """Download the YouTube video identified by ``tag`` into ``path``.

    Two downloads are performed, in this order, both into ``path``: the
    first stream with an ``mp4`` file extension, then the first audio-only
    stream.

    Args:
        tag: YouTube video id; it is appended to the standard watch URL.
        path: Output directory handed to pytube.

    Returns:
        The file path reported by pytube for the audio-only download.

    Raises:
        RuntimeError: If no mp4 stream or no audio-only stream is offered.
        Any exception raised by pytube itself propagates unchanged.
    """
    url = "https://www.youtube.com/watch?v=" + tag
    yt = YouTube(url, use_oauth=True, allow_oauth_cache=True)

    # StreamQuery.first() yields None when nothing matches; fail with a
    # readable message instead of an AttributeError on None.
    mp4_stream = yt.streams.filter(file_extension='mp4').first()
    if mp4_stream is None:
        raise RuntimeError("no mp4 stream available for " + url)
    mp4_stream.download(path)

    # NOTE(review): both downloads use pytube's default file name. If the
    # audio-only stream is also an mp4 container it presumably lands on the
    # same file as the download above -- confirm whether the first download
    # is actually needed.
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise RuntimeError("no audio-only stream available for " + url)
    return audio_stream.download(output_path=path)


def rename_file(name, i, directory="./data/mp4"):
    """Rename ``<directory>/<name>.mp4`` to ``<directory>/<i>.mp4``.

    Args:
        name: Current file name without the ``.mp4`` extension.
        i: New base name; converted with ``str()``.
        directory: Folder containing the file. Defaults to the folder the
            notebook downloads into, so existing two-argument calls keep
            working.

    Raises:
        OSError: Propagated from ``os.rename`` (e.g. the source is missing).
    """
    source = os.path.join(directory, name + ".mp4")
    destination = os.path.join(directory, str(i) + ".mp4")
    os.rename(source, destination)


def mp4_to_mp3(in_path, out_path):
    """Extract the audio track of ``in_path`` and write it to ``out_path``.

    Args:
        in_path: Path of the media file to read.
        out_path: Path of the audio file to write.

    Raises:
        Any exception raised by moviepy while opening or writing propagates;
        the clip is closed either way.
    """
    audio = mp.AudioFileClip(in_path)
    try:
        audio.write_audiofile(out_path)
    finally:
        # This function is called once per video in a long loop; close the
        # clip so its reader resources are released instead of piling up.
        audio.close()


def cut_descriptrion(in_path, out_path):
    """Cut the leading spoken passage out of an mp3 and export it as mp3.

    The audio is split on silence. Starting from the first chunk, later
    chunks are appended while the accumulated clip is at most 15 seconds
    long and the addition keeps it under 60 seconds. The final clip must be
    longer than 15 seconds and shorter than 60 seconds.

    Args:
        in_path: Path of the mp3 file to read.
        out_path: Path of the mp3 file to write.

    Raises:
        Exception: With message ``"cut fail"`` when no chunk is found or the
            resulting clip falls outside the (15, 60) second window.
    """
    audio = pydub.AudioSegment.from_mp3(in_path)

    chunks = pydub.silence.split_on_silence(
        audio, silence_thresh=-38, min_silence_len=800, keep_silence=1000
    )

    # No chunks at all: report the regular failure instead of an IndexError.
    if not chunks:
        raise Exception("cut fail")

    head = chunks[0]
    # Iterate over the chunks that actually exist after the first one; the
    # previous index arithmetic read one element past the end of the list.
    for chunk in chunks[1:]:
        if head.duration_seconds > 15:
            # Long enough already; nothing further can be appended.
            break
        # NOTE(review): a chunk that would push the clip to 60 s or more is
        # skipped and later chunks are still tried, which can join
        # non-adjacent audio -- confirm this is intended.
        if head.duration_seconds + chunk.duration_seconds < 60:
            head = head.append(chunk)

    if head.duration_seconds <= 15 or 60 <= head.duration_seconds:
        raise Exception("cut fail")

    head.export(out_path, format="mp3")

In [4]:
# Output folders. The clip folder is spelled "Description" everywhere: it was
# previously created as "Description" but read and written as "description",
# which only works on case-insensitive file systems.
mp4_dir = target_dir + "/mp4"
mp3_dir = target_dir + "/mp3"
description_dir = target_dir + "/Description"

for folder in (mp4_dir, mp3_dir, description_dir):
    os.makedirs(folder, exist_ok=True)


def _report_failure(failfile, i, title, reason, error):
    """Print one failure (with its cause) and append it to the fail log."""
    print(i, title, reason, "-", repr(error))
    failfile.write(str(i) + ". " + title + " " + reason + ".\n")


# Titles and descriptions are non-ASCII, so the encoding is fixed explicitly.
# The with-block closes both files even if the loop is interrupted.
with open(target_dir + "/train.csv", "w", encoding="utf-8") as traincsv, \
        open(target_dir + "/fail.txt", "w", encoding="utf-8") as failfile:
    traincsv.write("\"file\",\"Description\"\n")

    for i in range(len(data)):
        # str() keeps a missing or numeric title from crashing the run.
        title = str(data["Title"][i])
        description_path = description_dir + "/" + str(i) + ".mp3"

        # Rows whose clip already exists skip straight to the CSV line.
        if not os.path.isfile(description_path):
            tag = data["ID"][i]
            video_name = re.sub(r'[.\\/*?:"<>|%]', "", title)
            mp4_path = mp4_dir + "/" + str(i) + ".mp4"
            mp3_path = mp3_dir + "/" + str(i) + ".mp3"

            # Stages run in order; the first failure is logged and the row
            # is abandoned.
            steps = (
                ("cant downlaod", lambda: download_video(tag, mp4_dir)),
                ("cant rename", lambda: rename_file(video_name, i)),
                ("cant convert", lambda: mp4_to_mp3(mp4_path, mp3_path)),
                ("cant cut", lambda: cut_descriptrion(mp3_path, description_path)),
            )

            failed = False
            for reason, step in steps:
                try:
                    step()
                except Exception as error:  # Ctrl-C still stops the run
                    _report_failure(failfile, i, title, reason, error)
                    failed = True
                    break
            if failed:
                continue

        try:
            # Strip brackets, punctuation and all whitespace from the text.
            description = re.sub(r'[「」『』()《》.\\/*:"<>|%#\\s]', "", data["Description"][i])
            traincsv.write("\"" + str(i) + ".mp3\",\"" + description + "\"\n")
        except Exception as error:
            _report_failure(failfile, i, title, "cant write train.txt", error)
            continue

        print(i, title, "success")
    



1570 赤科山聯外道路經常損毀 地方提升級縣道【客家新聞20230101】 success
MoviePy - Writing audio in ./data/mp3/1571.mp3


                                                                      

MoviePy - Done.
1571 竹縣元旦升旗 反生命園區自救會遞陳情書【客家新聞20230101】 success
MoviePy - Writing audio in ./data/mp3/1572.mp3


                                                                      

MoviePy - Done.
1572 剪黏泥塑技術保存者徐明河辭世 各界緬懷【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1573.mp3


                                                                      

MoviePy - Done.
1573 員林7成客底 客委會補助多元課程要尋根【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1574.mp3


                                                                      

MoviePy - Done.
1574 漫遊香草銅鑼 社區推敬老護幼體驗活動【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1575.mp3


                                                                      

MoviePy - Done.
1575 新園鄉圖書新館動土 擴大面積.明年底完工【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1576.mp3


                                                                      

MoviePy - Done.
1576 竹縣蓮花推廣協會前理事長 種大王蓮20年【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1577.mp3


                                                                      

MoviePy - Done.
1577 國姓咖啡節移師綠雕園區 盼帶動地方發展【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1578.mp3


                                                                      

MoviePy - Done.
1578 慶祝觀光工廠推行20週年 推特色嘉年華會【客家新聞20230719】 success
MoviePy - Writing audio in ./data/mp3/1579.mp3


                                                                      

MoviePy - Done.
1579 後龍聚善堂吉日啟用 頭份爭取濱江街拓寬【客家新聞20230719】 success
1580 南非工業化程度高 深陷能源危機缺電嚴重【客家新聞20230720】 cant downlaod
MoviePy - Writing audio in ./data/mp3/1581.mp3


                                                                      

MoviePy - Done.
1581 職場性平調查 僅1成5企業具性別友善政策【客家新聞20230720】 success
MoviePy - Writing audio in ./data/mp3/1582.mp3


                                                                      

MoviePy - Done.
1582 三義樂齡長者彩繪牆 一筆一畫在地美景【客家新聞20230720】 success
MoviePy - Writing audio in ./data/mp3/1583.mp3


                                                                      

MoviePy - Done.
1583 年產值4.5億元 首屆「美濃野菜節」7/22登場【客家新聞20230722】 success
MoviePy - Writing audio in ./data/mp3/1584.mp3


                                                                      

MoviePy - Done.
1584 東專與小農合推食農教育 推廣吃當地當季【客家新聞20230722】 success
MoviePy - Writing audio in ./data/mp3/1585.mp3


                                                                      

MoviePy - Done.
1585 臺東獨立書店和民間組織 合辦客語廣播營【客家新聞20230720】 success
MoviePy - Writing audio in ./data/mp3/1586.mp3


                                                                      

MoviePy - Done.
1586 高師大附中「海洋攔阻網」 獲美公益設計獎【客家新聞20230720】 success
MoviePy - Writing audio in ./data/mp3/1587.mp3


                                                                      

MoviePy - Done.
1587 頭份蜂蜜換新裝 軟管形式擠蜂蜜像擠牙膏【客家新聞20230720】 success
