## <h1> Chinese version of audio to text

### <h2> 1. Environment & dependency setup

In [19]:
!pip3 install -U funasr



In [20]:

!pip3 install modelscope



### <h2> 2. function definition

In [1]:
# function block
import os
import subprocess

import librosa
import numpy as np
import torch
from docx import Document
from IPython.display import clear_output
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks



def convert_video_to_text(input_file, output_file, output_format, time_flag):
    """Transcribe one audio/video file and append the text to ``output_file``.

    The input is cut into chunks by ``convert_video_to_audio`` (which also
    deletes ``input_file``), every chunk is run through the Paraformer ASR
    pipeline, and the results are appended to ``output_file`` underneath a
    ``"<input_file>:"`` header line.

    Args:
        input_file: Path of the media file to transcribe.
        output_file: Path of the text file to append to. Its parent directory
            is created when it does not exist yet.
        output_format: Accepted for interface compatibility; not used.
        time_flag: Forwarded to ``save_transcription_to_text``.
    """
    # Make sure the transcript can be written *before* the source file is
    # converted and deleted; otherwise a missing output folder would fail
    # only after the input is already gone.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    # Load the speech-to-text model
    torch.cuda.empty_cache()
    pipe = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        model_revision="v1.2.4")

    # Convert the input into audio chunks
    audio_file = convert_video_to_audio(input_file)

    with open(output_file, "a") as f:
        f.write(f"{input_file}:\n")
    for i in audio_file:
        transcription = transcribe_audio_to_text(i, pipe)
        # save_transcription_to_text also removes the chunk file, so no
        # extra clean-up is needed here.
        save_transcription_to_text(i, transcription, output_file, time_flag)
    print(f"{input_file} completed")
                            
def _run_ffmpeg(arguments):
    """Run ffmpeg with ``arguments`` (a list), raising if it exits non-zero."""
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; "-y" overwrites stale outputs from earlier runs
    # instead of silently keeping them.
    subprocess.run(["ffmpeg", "-y", *arguments], check=True)


def convert_video_to_audio(input_file, segment_seconds=100):
    """Cut ``input_file`` into consecutive WAV chunks in the working directory.

    A ``.wav`` input is split as-is. Any other input is first converted to a
    16 kHz mono PCM file named ``temp.wav`` and that file is split.

    Side effects: ``input_file`` (and ``temp.wav``, when created) is deleted
    once all chunks have been written. If ffmpeg fails, an exception is
    raised before anything is deleted.

    Args:
        input_file: Path of the audio or video file to split.
        segment_seconds: Length of each chunk in whole seconds (default 100).

    Returns:
        Tuple of chunk file names ``split_<start>.wav``, where ``<start>`` is
        the chunk's offset in seconds from the beginning of the input.

    Raises:
        subprocess.CalledProcessError: if an ffmpeg invocation fails.
    """
    if input_file.endswith(".wav"):
        temp_audio = None
        source_audio = input_file
    else:
        # Convert video to audio using ffmpeg
        temp_audio = "temp.wav"
        _run_ffmpeg(["-i", input_file, "-vn", "-acodec", "pcm_s16le",
                     "-ar", "16000", "-ac", "1", temp_audio])
        source_audio = temp_audio

    # Split the audio into segment_seconds-long chunks
    duration = librosa.get_duration(filename=source_audio)
    split_files = []
    for start in range(0, int(duration), segment_seconds):
        split_file = f"split_{start}.wav"
        _run_ffmpeg(["-i", source_audio, "-ss", str(start),
                     "-t", str(segment_seconds), split_file])
        split_files.append(split_file)

    # Only reached when every ffmpeg call succeeded
    if temp_audio is not None:
        os.remove(temp_audio)
    os.remove(input_file)

    return tuple(split_files)


def transcribe_audio_to_text(audio_file, pipeline):
    """Run the given ASR pipeline on one audio file and return its result.

    Args:
        audio_file: Path of the audio file; it is loaded with ``sr=16000``.
        pipeline: Callable that receives the loaded samples.

    Returns:
        Whatever ``pipeline`` returns for the loaded samples.
    """
    # librosa.load yields (samples, sample_rate); only the samples are used
    samples, _sample_rate = librosa.load(audio_file, sr=16000)

    # Release cached GPU memory before running inference
    torch.cuda.empty_cache()
    return pipeline(samples)

def save_transcription_to_text(i, transcription, output_file, time_flag):
    """Append one chunk's transcription to ``output_file`` and delete the chunk.

    Args:
        i: Path of the chunk file, named ``<prefix>_<offset>.<ext>``
            (for example ``split_100.wav``).
        transcription: Mapping with a ``"text"`` entry; a falsy value means
            there is nothing to write.
        output_file: Text file that is opened in append mode.
        time_flag: When true the line is written as
            ``"Time: <offset>s  Content: <text>"``, otherwise just the text.
    """
    # Take the offset from the file name only, so directories or names that
    # contain extra "." or "_" characters no longer break the parsing.
    stem = os.path.splitext(os.path.basename(i))[0]
    offset = stem.rsplit("_", 1)[-1]

    # Save transcription to text file
    with open(output_file, "a") as file:
        if transcription:
            content = transcription["text"]
            if time_flag:
                file.write(f"Time: {offset}s  Content: {content}\n")
            else:
                file.write(f"{content}\n")

    # Delete the chunk without going through the shell; like `rm -f`, a chunk
    # that is already gone is not an error.
    try:
        os.remove(i)
    except FileNotFoundError:
        pass



  from .autonotebook import tqdm as notebook_tqdm
2023-09-20 22:49:07,924 - modelscope - INFO - PyTorch version 1.13.1+cu116 Found.
2023-09-20 22:49:07,924 - modelscope - INFO - Loading ast index from /home/lwsze/.cache/modelscope/ast_indexer
2023-09-20 22:49:08,068 - modelscope - INFO - Loading done! Current index file version is 1.7.1, with md5 166028151a1a39c52457a15775a869d4 and a total number of 861 components indexed


### <h2> 3. folder run

In [2]:
# Control block for a whole-folder run

# Folder whose files are transcribed. Processed files are deleted from this
# folder by convert_video_to_audio, so keep the originals elsewhere.
input_folder = "input"

# Folder that receives one "<name>.txt" transcript per input file
output_folder = "output"

# Passed through to convert_video_to_text, which does not read it
output_format = "text" 

# True: prefix every transcript line with "Time: <offset>s"; False: text only
time_flag = True

In [3]:

# Show the directory that the relative input/output paths are resolved against
working_directory = os.getcwd()
print(working_directory)

/home/lwsze/projects/video-2-text


In [4]:
# Split very long videos.
# Every .mp4 in the input folder that is longer than 20 minutes (1200 s) is
# converted to a 16 kHz mono WAV and cut into 20-minute pieces named
# "<name>_split_<offset>.wav" in the same folder. The original .mp4 and the
# intermediate WAV are deleted afterwards.

from moviepy.editor import VideoFileClip

SPLIT_SECONDS = 1200

current_directory = os.getcwd()
for filename in os.listdir(input_folder):
    in_file_path = os.path.join(input_folder, filename)
    if not (os.path.isfile(in_file_path) and in_file_path.endswith(".mp4")):
        continue

    # Close the clip explicitly so its reader is released straight away
    clip = VideoFileClip(in_file_path)
    try:
        clip_duration = clip.duration
    finally:
        clip.close()
    if clip_duration <= SPLIT_SECONDS:
        continue

    # splitext removes only the extension; str.strip('.mp4') strips any of the
    # characters ".", "m", "p", "4" from both ends and mangles names such as
    # "clip.mp4" into "cli".
    base_path = os.path.splitext(in_file_path)[0]
    audio_file = f"{base_path}.wav"

    # Argument lists (no shell) keep paths with spaces intact; check=True
    # stops before anything is deleted if ffmpeg fails.
    subprocess.run(
        ["ffmpeg", "-y", "-i", in_file_path, "-vn", "-acodec", "pcm_s16le",
         "-ar", "16000", "-ac", "1", audio_file],
        check=True)
    os.remove(in_file_path)

    duration = librosa.get_duration(filename=audio_file)
    for i in range(0, int(duration), SPLIT_SECONDS):
        split_file = f"{base_path}_split_{i}.wav"
        subprocess.run(
            ["ffmpeg", "-y", "-i", audio_file, "-ss", str(i),
             "-t", str(SPLIT_SECONDS), split_file],
            check=True)
    os.remove(audio_file)
clear_output()
print(f"Split file completed on {input_folder}")
        

Split file completed on input


In [5]:
# Folder transcription: write one "<name>.txt" transcript per file in input_folder

# Create the output folder up front so a missing folder cannot fail the run
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    in_file_path = os.path.join(input_folder, filename)
    if not os.path.isfile(in_file_path):
        continue
    # splitext removes only the extension; the previous strip('.mp4').strip('.wav')
    # stripped character sets from both ends and could corrupt the name.
    out_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
    print(f"{in_file_path} is processing")
    convert_video_to_text(in_file_path, out_file_path, output_format, time_flag)

clear_output()
print("All transcription completed")
torch.cuda.empty_cache()

All transcription completed


### <h2> 4. Merge up file tools

In [None]:
# Optional: merge the transcripts of files that were split into pieces earlier.
# Not needed for files shorter than 20 minutes, which are never split.

In [32]:
import os
import re
from collections import defaultdict
folder_path = output_folder

# Collect the path of every regular file that sits directly in the folder
files = []
for entry in os.listdir(folder_path):
    candidate = os.path.join(folder_path, entry)
    if os.path.isfile(candidate):
        files.append(candidate)


def copy_files(file_list, output_file):
    """Concatenate the text of several files into ``output_file``.

    ``output_file`` is opened for writing (truncating any existing content)
    and receives the content of each file in ``file_list``, in order, each
    followed by one newline character.
    """
    with open(output_file, 'w') as merged:
        for source_path in file_list:
            with open(source_path, 'r') as source:
                text = source.read()
            # The trailing newline separates one file's content from the next
            merged.write(text + '\n')

# Split transcripts are named "<name>_split_<offset>.txt", where <offset> is
# the piece's start time in whole seconds. The dot is escaped and the pattern
# is anchored so only real ".txt" split files match.
exp = r"(?P<name>.*?)_split_(?P<session>\d+)\.txt$"
pattern = re.compile(exp)

# Group the offsets by original file name. A dedicated name is used instead of
# `dict` so the builtin is not shadowed for the rest of the notebook.
sessions_by_name = defaultdict(list)
for file in files:
    result = pattern.search(file)
    if result:
        sessions_by_name[result.group("name")].append(result.group("session"))

for name, session in sessions_by_name.items():
    newFileName = f"{name}_merged"
    # Sort numerically: as strings "10800" would sort before "1200" and long
    # recordings would be merged out of order.
    originalFileName = [f"{name}_split_{offset}.txt" for offset in sorted(session, key=int)]
    copy_files(originalFileName, newFileName)
    # The pieces are removed once their content is in the merged file
    for file in originalFileName:
        os.remove(file)

### <h2> 5. file run

In [None]:
# using re to merge files

In [None]:
# Control block for a single-file run

# File to transcribe. It is deleted by convert_video_to_audio once it has been
# split into chunks, so keep the original elsewhere.
input_file = "input/input.mp4"

# Text file the transcript is appended to
output_file = "output/output.txt"

# Passed through to convert_video_to_text, which does not read it
output_format = "text" 

# True: prefix every transcript line with "Time: <offset>s"; False: text only
time_flag = True

In [None]:
# Single-file transcription: transcribe input_file and append the text to output_file
convert_video_to_text(input_file, output_file, output_format, time_flag)
# Clear the cell's accumulated output before printing the final status line
clear_output()
print("All transcription completed")

All transcription completed


### <h2> 6. Option: Remove file and resources

In [36]:
# Delete everything matching *.wav in the current working directory (irreversible)
! rm -rf *.wav

In [37]:
# Delete everything matching *.mp4 in the input folder (irreversible)
! rm -rf input/*.mp4

In [40]:
# Delete everything matched by input/* (irreversible)
! rm -rf input/*

In [41]:
# Delete everything matched by output/*, including finished transcripts (irreversible)
! rm -rf output/*

### <h2> 7. Option: Summarization single file

In [43]:
# File to summarize.
# NOTE(review): "output" is also the name of the transcript folder, and no
# visible cell reads this variable — confirm the intended path and usage.
fileToBeSum = "output" #file to summarize

In [45]:
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the ModelScope extractive-summarization pipeline (PoNet, Chinese, topic level)
p = pipeline(
    task=Tasks.extractive_summarization,
    model='damo/nlp_ponet_extractive-summarization_topic-level_chinese-base')

# NOTE(review): the document below is a hard-coded sample text; fileToBeSum is
# not read here — confirm whether the transcript file should be passed instead.
result = p(documents='移动端语音唤醒模型，检测关键词为“小云小云”。模型主体为4层FSMN结构，使用CTC训练准则，参数量750K，适用于移动端设备运行。模型输入为Fbank特征，输出为基于char建模的中文全集token预测，测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式，basetrain过程使用大量内部移动端数据，在此基础上，使用1万条设备端录制安静场景“小云小云”数据进行微调，得到最终面向业务的模型。后续用户可在basetrain模型基础上，使用其他关键词数据进行微调，得到新的语音唤醒模型，但暂时未开放模型finetune功能。')

# Print the summary text stored under OutputKeys.TEXT
print(result[OutputKeys.TEXT])

2023-08-06 23:50:13,414 - modelscope - INFO - Model revision not specified, use the latest revision: v1.0.0
2023-08-06 23:50:13,869 - modelscope - INFO - initiate model from /home/lwsze/.cache/modelscope/hub/damo/nlp_ponet_extractive-summarization_topic-level_chinese-base
2023-08-06 23:50:13,869 - modelscope - INFO - initiate model from location /home/lwsze/.cache/modelscope/hub/damo/nlp_ponet_extractive-summarization_topic-level_chinese-base.
2023-08-06 23:50:13,872 - modelscope - INFO - initialize model from /home/lwsze/.cache/modelscope/hub/damo/nlp_ponet_extractive-summarization_topic-level_chinese-base
You are using a model of type bert to instantiate a model of type ponet. This is not supported for all configurations of models and can yield errors.


模型输入为Fbank特征，输出为基于char建模的中文全集token预测，测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。
模型训练采用“basetrain + finetune”的模式，basetrain过程使用大量内部移动端数据，在此基础上，使用1万条设备端录制安静场景“小云小云”数据进行微调，得到最终面向业务的模型。


### <h2> 8. Option: Summarization Multiple file