## <h1> English version of audio to text

### <h2> 1. Environment & dependency setup

In [None]:
!pip3 install transformer

### <h2> 2. function definition

In [1]:
# function block
import os
import subprocess

import librosa
import numpy as np
import torch
from docx import Document
from IPython.display import clear_output
from transformers import pipeline
from transformers import WhisperModel, WhisperConfig, WhisperFeatureExtractor



def convert_video_to_text(input_file, output_file, output_format, time_flag):
    """Transcribe one media file and append the result to ``output_file``.

    The input is turned into WAV segments by ``convert_video_to_audio``; each
    segment is run through the ``openai/whisper-medium`` speech-recognition
    pipeline and appended to the output file by ``save_transcription_to_text``.

    Args:
        input_file: Path of the media file to transcribe.
        output_file: Path of the text file to append to; it and its parent
            directory are created when missing.
        output_format: Not used by this function; kept so existing callers
            keep working.
        time_flag: Forwarded to ``save_transcription_to_text``; when truthy each
            line is prefixed with the segment's start offset.
    """
    # Load the speech-to-text model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()
    pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=device)

    # open(..., "a") creates the file but not its directory
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    segments = ()
    try:
        # Convert video to audio segments
        segments = convert_video_to_audio(input_file)
        with open(output_file, "a") as f:
            f.write(f"{input_file}:\n")
        for segment in segments:
            transcription = transcribe_audio_to_text(device, segment, pipe)
            save_transcription_to_text(segment, transcription, output_file, time_flag)
    finally:
        # Remove only the intermediates this run produced ("temp.wav" is the name
        # convert_video_to_audio writes to) instead of every *.wav in the
        # working directory, which could destroy unrelated recordings.
        for leftover in (*segments, "temp.wav"):
            if os.path.isfile(leftover):
                os.remove(leftover)
    print(f"{input_file} completed")
                            
def convert_video_to_audio(input_file, segment_seconds=100):
    """Extract the audio of ``input_file`` and cut it into fixed-length WAV segments.

    The audio is first written to ``temp.wav`` (16 kHz, mono, 16-bit PCM) in the
    working directory, then cut into ``split_<start>.wav`` files where
    ``<start>`` is the segment's start offset in seconds.

    Args:
        input_file: Path of the media file to read.
        segment_seconds: Length of each segment in seconds (default 100).

    Returns:
        Tuple of segment file names in chronological order.

    Raises:
        ValueError: If ``segment_seconds`` is not positive.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if segment_seconds <= 0:
        raise ValueError("segment_seconds must be positive")

    audio_file = "temp.wav"
    try:
        # Argument list instead of a shell string so paths containing spaces or
        # shell metacharacters work; -y overwrites a stale file from an earlier run.
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_file, "-vn", "-acodec", "pcm_s16le",
             "-ar", "16000", "-ac", "1", audio_file],
            check=True,
        )

        # Split audio into segment_seconds-long pieces
        duration = librosa.get_duration(filename=audio_file)
        split_files = []
        start = 0
        # Compare against the float duration so a trailing partial segment
        # (e.g. the last 0.7 s of a 100.7 s file) is not dropped.
        while start < duration:
            split_file = f"split_{start}.wav"
            subprocess.run(
                ["ffmpeg", "-y", "-i", audio_file, "-ss", str(start),
                 "-t", str(segment_seconds), split_file],
                check=True,
            )
            split_files.append(split_file)
            start += segment_seconds
    finally:
        # The full-length WAV is only an intermediate
        if os.path.isfile(audio_file):
            os.remove(audio_file)

    return tuple(split_files)


def transcribe_audio_to_text(device, audio_file, pipeline):
    """Run one audio file through the given speech-recognition pipeline.

    Args:
        device: Not used here; accepted so callers can keep passing it.
        audio_file: Path of the audio file, loaded at a 16 kHz sample rate.
        pipeline: Callable pipeline object that performs the recognition.

    Returns:
        Whatever the pipeline call returns for this audio.
    """
    waveform, _sample_rate = librosa.load(audio_file, sr=16000)

    # Options forwarded unchanged to the pipeline call; the generation task is "translate"
    inference_options = {
        "max_new_tokens": 256,
        "generate_kwargs": {"task": "translate"},
        "chunk_length_s": 30,
        "batch_size": 8,
    }
    return pipeline(waveform, **inference_options)

def save_transcription_to_text(i, transcription, output_file, time_flag):
    """Append one segment's transcription to ``output_file`` and delete the segment.

    Args:
        i: Path of the segment file, named ``<anything>_<start>.<ext>``; the
            text after the last underscore of the base name is used as the
            start offset.
        transcription: Mapping with a ``"text"`` entry; nothing is written when
            it is falsy.
        output_file: Text file to append to (created if missing).
        time_flag: When truthy, write ``Time: <start>s  Content: <text>``;
            otherwise write only the text.
    """
    # Parse the offset from the base name only, so directories or extra dots
    # and underscores in the path cannot break the unpacking.
    stem = os.path.splitext(os.path.basename(i))[0]
    offset = stem.rsplit("_", 1)[-1]

    with open(output_file, "a") as file:
        if transcription:
            content = transcription["text"]
            if time_flag:
                file.write(f"Time: {offset}s  Content: {content}\n")
            else:
                file.write(f"{content}\n")

    # Best-effort removal without going through a shell
    try:
        os.remove(i)
    except FileNotFoundError:
        pass



  from .autonotebook import tqdm as notebook_tqdm


### <h2> 3. folder run

In [10]:
# Settings for transcribing every file in one folder

input_folder = "input"    # folder scanned for media files
output_folder = "output"  # folder that receives the transcripts
output_format = "text"    # passed through to convert_video_to_text
time_flag = True          # True: prefix each transcript line with the segment start offset

In [11]:

# Show the kernel's working directory; the relative input/output folders resolve against it
working_directory = os.getcwd()
print(working_directory)

/home/lwsze/projects/video-2-text


In [12]:
# Split very long recordings before transcription.
# Every .mp4 in the input folder longer than SPLIT_SECONDS (20 min) is replaced
# by 16 kHz mono WAV pieces named <name>_split_<start>.wav, each at most
# SPLIT_SECONDS long.

from moviepy.editor import VideoFileClip

SPLIT_SECONDS = 1200

current_directory = os.getcwd()
for filename in sorted(os.listdir(input_folder)):
    in_file_path = os.path.join(input_folder, filename)
    if not (os.path.isfile(in_file_path) and in_file_path.endswith(".mp4")):
        continue

    # Close the clip explicitly so its reader is released
    clip = VideoFileClip(in_file_path)
    try:
        video_seconds = clip.duration
    finally:
        clip.close()
    if video_seconds <= SPLIT_SECONDS:
        continue

    # splitext removes the extension; str.strip('.mp4') would instead strip any
    # of the characters ".", "m", "p", "4" from both ends of the path.
    base_path = os.path.splitext(in_file_path)[0]
    audio_file = f"{base_path}.wav"
    # check=True aborts on an ffmpeg failure before anything is deleted
    subprocess.run(
        ["ffmpeg", "-y", "-i", in_file_path, "-vn", "-acodec", "pcm_s16le",
         "-ar", "16000", "-ac", "1", audio_file],
        check=True,
    )

    duration = librosa.get_duration(filename=audio_file)
    start = 0
    # Float comparison keeps a trailing partial piece
    while start < duration:
        split_file = f"{base_path}_split_{start}.wav"
        subprocess.run(
            ["ffmpeg", "-y", "-i", audio_file, "-ss", str(start),
             "-t", str(SPLIT_SECONDS), split_file],
            check=True,
        )
        start += SPLIT_SECONDS

    # Only now that every piece exists, drop the intermediate WAV and the original video
    os.remove(audio_file)
    os.remove(in_file_path)

clear_output()
print(f"Split file completed on {input_folder}")
        

Split file completed on input


In [13]:
# Transcribe every file in the input folder into <output_folder>/<name>.txt

os.makedirs(output_folder, exist_ok=True)

# sorted() gives a reproducible processing order
for filename in sorted(os.listdir(input_folder)):
    in_file_path = os.path.join(input_folder, filename)
    if not os.path.isfile(in_file_path):
        continue
    # splitext drops only the extension; chained str.strip('.mp4').strip('.wav')
    # removes characters from both ends (e.g. "meeting_split_0.wav" -> "eeting_split_0").
    stem = os.path.splitext(filename)[0]
    out_file_path = os.path.join(output_folder, f"{stem}.txt")
    print(f"{in_file_path} is processing")
    convert_video_to_text(in_file_path, out_file_path, output_format, time_flag)

clear_output()
print("All transcription completed")
torch.cuda.empty_cache()

All transcription completed


### <h2> 4. Tools for merging split files

In [14]:
# Optional: merge the transcripts of files that were split in section 3.
# Not needed for recordings of 20 minutes or less, which are never split.

In [15]:
import os
import re
from collections import defaultdict
folder_path = output_folder

# Collect the path of every regular file directly inside the folder
files = []
for entry_name in os.listdir(folder_path):
    candidate = os.path.join(folder_path, entry_name)
    if os.path.isfile(candidate):
        files.append(candidate)


def copy_files(file_list, output_file):
    """Concatenate the text of ``file_list`` into ``output_file``.

    ``output_file`` is opened for writing (replacing any previous content) and
    each source file's text is written in list order, followed by a newline.
    """
    with open(output_file, 'w') as merged:
        for source_path in file_list:
            with open(source_path, 'r') as source:
                contents = source.read()
            merged.write(contents)
            merged.write('\n')  # separator between consecutive files

# Transcripts of split inputs are named <name>_split_<start>.txt.
# Raw string with an escaped dot and an end anchor, so only real ".txt" files
# whose session part is numeric are picked up.
pattern = re.compile(r"(?P<name>.*?)_split_(?P<session>\d+)\.txt$")

# Not called `dict`: that would shadow the builtin for the rest of the session
sessions_by_name = defaultdict(list)

# Group the session offsets by original file name
for file in files:
    result = pattern.search(file)
    if result:
        sessions_by_name[result.group("name")].append(result.group("session"))

for name, sessions in sessions_by_name.items():
    # NOTE(review): the merged file has no ".txt" extension - confirm this is intended
    merged_file_name = f"{name}_merged"
    # Sort numerically: as strings, "10800" would come before "1200"
    part_files = [f"{name}_split_{session}.txt" for session in sorted(sessions, key=int)]
    copy_files(part_files, merged_file_name)
    # Parts are removed only after the merged file has been written
    for part_file in part_files:
        os.remove(part_file)

print("Merging completed")

### <h2> 5. file run

In [None]:
# using re to merge files

In [None]:
# Settings for transcribing one file

input_file = "input/input.mp4"     # media file to transcribe
output_file = "output/output.txt"  # transcript is appended to this file
output_format = "text"             # passed through to convert_video_to_text
time_flag = True                   # True: prefix each transcript line with the segment start offset

In [None]:
# Transcribe the single file selected in the control block, then clear the cell output
convert_video_to_text(
    input_file=input_file,
    output_file=output_file,
    output_format=output_format,
    time_flag=time_flag,
)
clear_output()
print("All transcription completed")

All transcription completed


### <h2> 6. Optional: remove files and resources

In [1]:
# Destructive: deletes everything matching *.wav in the current working directory
! rm -rf *.wav

In [None]:
# Destructive: deletes everything matching *.mp4 in the input folder
! rm -rf input/*.mp4

In [None]:
# Destructive: deletes every non-hidden entry in the input folder
! rm -rf input/*

In [None]:
# Destructive: deletes every non-hidden entry in the output folder, including transcripts
! rm -rf output/*