In [None]:
# !sudo apt-get -y install fuse3

In [None]:
# !sudo ln -s /bin/fusermount /bin/fusermount3

In [None]:
# !wget https://downloads.rclone.org/v1.64.2/rclone-v1.64.2-linux-amd64.deb
# !apt install ./rclone-v1.64.2-linux-amd64.deb

In [None]:
# #@markdown Run this cell. In `rclone config`, select `n` for a new remote, name it `onedrive`, and enter the number in the list that corresponds to OneDrive (`26` in the current version). Press Enter for `client_id` and for `client_secret`, then answer `n` to skip the advanced config and `n` for auto config.
# #@markdown Then paste the access token generated on your own machine, and select number 1 for "OneDrive Personal" or "OneDrive Business".
# #@markdown Then, under "found drives", check that the configuration matches the drive you want to mount. Finally, select `y` twice and `q` to quit the configuration.
# !rclone config

In [None]:
# !sudo mkdir /content/onedrive
# !nohup rclone --vfs-cache-mode writes mount onedrive: /content/onedrive &

In [None]:
# Import Colab's Drive helper and mount Google Drive at /content/drive.
# A later cell refers to files under 'drive/MyDrive/...', i.e. below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the packages imported by the cells below.
# mutagen: read MP3 metadata (used for the audio duration)
!pip install mutagen
# librosa: load and resample audio
!pip install librosa
# transformers + torch: Wav2Vec2 / Whisper models and processors
!pip install transformers
!pip install torch
# kenlm + pyctcdecode: presumably required for the language-model decoding done by
# Wav2Vec2ProcessorWithLM in the wav2vec2 cell -- confirm against the transformers docs
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install pyctcdecode

In [None]:
import torchaudio
import librosa
import numpy as np
from mutagen.mp3 import MP3
import requests
from typing import Tuple

class LocalLoader():
    '''
    Read an MP3 file one fixed-length segment at a time.

    sampling_rate: rate passed to librosa.load as `sr`.
    duration: length in seconds of every segment returned by load().
    '''
    def __init__(self, sampling_rate=16000, duration=120):
        self.sampling_rate = sampling_rate

        # decide duration for each time read an audio
        self.duration = duration

    def load(self, file_path: str, segment: int):
        '''
        Load one segment of the audio file.

        file_path: path of the MP3 file.
        segment: 1-based index of the segment; segment k starts at
                 duration * (k - 1) seconds.

        Returns (audio, sampling_rate) exactly as produced by librosa.load.
        Raises ValueError when `segment` is smaller than 1 or when the segment
        starts at or after the end of the file.
        '''
        if segment < 1:
            # a non-positive index would turn into a negative offset below
            raise ValueError('Segment index must be >= 1, got ' + str(segment))

        starting_time = self.duration * (segment - 1)

        # Strict comparison: a segment that starts exactly at the end of the
        # file contains no audio, so it is out of range as well.
        if MP3(file_path).info.length <= starting_time:
            raise ValueError('Out of segment at ' + str(file_path))

        audio, file_sampling_rate = librosa.load(
            file_path,
            sr=self.sampling_rate,
            offset=starting_time,
            duration=self.duration,
        )
        return audio, file_sampling_rate


class LocalSplitter(LocalLoader):
    '''
    Load a segment of audio and cut it into equally sized chunks.

    chunk_size: length of one chunk in seconds; `duration` must be a multiple
                of it (checked with an assert in the constructor).
    '''
    def __init__(self, sampling_rate=16000, chunk_size=5, duration=120):
        assert duration % chunk_size == 0

        super().__init__(sampling_rate, duration)
        self.chunk_size = chunk_size

    def load(self, file_path: str, segment: int):
        '''
        Load one segment and split it into chunks of `chunk_size` seconds.

        Returns (audio_chunks, final_chunk):
          audio_chunks: 2-D array with one row per complete chunk, or None
                        when the segment holds no complete chunk.
          final_chunk:  1-D array with the leftover samples zero-padded to a
                        full chunk length, or None when nothing is left over.
        Exceptions raised by LocalLoader.load propagate unchanged.
        '''
        # load audio (the returned sampling rate is not needed here)
        full_audio, _ = super().load(file_path, segment)

        # number of samples needed to store one chunk of audio
        samples_per_chunk = self.chunk_size * self.sampling_rate
        # complete chunks, and the size of the (possibly shorter) final chunk
        num_full_chunks, remainder = divmod(full_audio.shape[0], samples_per_chunk)

        # complete chunks, one per row
        audio_chunks = None
        if num_full_chunks > 0:
            audio_chunks = full_audio[: num_full_chunks * samples_per_chunk].reshape(num_full_chunks, -1)

        # leftover samples, zero-padded on the right
        final_chunk = None
        if remainder > 0:
            # keep the dtype of the loaded audio; np.zeros defaults to float64
            # and would otherwise not match the rows of audio_chunks
            final_chunk = np.zeros(samples_per_chunk, dtype=full_audio.dtype)
            final_chunk[:remainder] = full_audio[full_audio.shape[0] - remainder:]

        return audio_chunks, final_chunk

In [None]:
from mutagen.mp3 import MP3
import pathlib
import json
import sys
import torch

class FileTranscripter(LocalSplitter):
    '''
    Transcribe an MP3 file segment by segment with a pluggable model.

    The model is supplied as three callables:
      processor_feature_extractor(audio) -> model inputs exposing `.to(device)`
      model_inferencer(inputs)           -> raw model output
      processor_tokenizer_decoder(out)   -> sequence of texts (indexed with [0]
                                            and extended with append below)
    '''
    def __init__(self, model_inferencer, processor_feature_extractor, processor_tokenizer_decoder, device, sampling_rate=16000, chunk_size=5,duration=120):
        super().__init__(sampling_rate, chunk_size, duration)
        # device
        self.device = device
        # model to use
        self.model_inferencer = model_inferencer
        self.processor_tokenizer_decoder = processor_tokenizer_decoder
        self.processor_feature_extractor = processor_feature_extractor


    def _count_segments(self, file_path):
        '''Return how many segments of `self.duration` seconds the file spans.'''
        file_duration = MP3(file_path).info.length
        full_segments = int(file_duration / self.duration)
        # one extra segment when a shorter tail remains after the full ones
        has_tail = (file_duration - self.duration * full_segments) > 0
        return full_segments + int(has_tail)


    def _run_model(self, audio):
        '''Extract features, move them to the device, run the model and decode.'''
        features = self.processor_feature_extractor(audio)
        # torch.Tensor.to() returns a new tensor instead of moving in place, so
        # the result has to be kept. The None check keeps in-place style
        # containers (whose .to() returns nothing) working as before.
        moved = features.to(self.device)
        if moved is not None:
            features = moved
        # inference only: no autograd bookkeeping needed
        with torch.no_grad():
            output = self.model_inferencer(features)
        return self.processor_tokenizer_decoder(output)


    def transcript_segment(self, file_path: str, segment: int):
        '''
        Transcribe one segment of the audio (the first segment is 1).

        Returns a list with one text per chunk, or None when the segment could
        not be loaded (the error is printed, not raised).
        '''
        try:
            audio_chunks, final_audio_chunk = self.load(file_path= file_path, segment= segment)
        except Exception as exc:
            print(f'{file_path}, segment {segment} ', end='')
            print(str(exc))
            return None

        result = []

        # transcribe the complete chunks of the segment in one batch
        if audio_chunks is not None:
            result = list(self._run_model(audio_chunks))

        # transcribe the zero-padded final chunk
        if final_audio_chunk is not None:
            result.append(self._run_model(final_audio_chunk)[0])

        return result


    def transcript_file(self, file_path: str, starting_segment: int):
        '''
        Transcribe the audio from `starting_segment` to the end, one segment at a time.

        Returns a list with one entry per segment of the file. Entries before
        `starting_segment` keep the placeholder 0; the others hold the result
        of transcript_segment (a list of texts, or None on a load failure).
        '''
        num_of_segment = self._count_segments(file_path)

        result = [0] * num_of_segment

        # transcribe the segments one by one
        for segment in range(starting_segment, num_of_segment + 1):
            result[segment - 1] = self.transcript_segment(file_path, segment)

        return result


    def transcript_write_file(self, file_path: str, des_path: str, starting_segment: int, reading_state_path: str, stop_path :str):
        '''
        Transcribe the audio from `starting_segment` onwards and append the
        text to `des_path`, one line per chunk.

        reading_state_path: JSON file whose "final_read_segment" entry is
            rewritten after every segment so that the work can be resumed.
        stop_path: text file holding an integer that is re-read before every
            segment. A non-zero value saves the progress and ends the run via
            sys.exit().
        '''
        num_of_segment = self._count_segments(file_path)

        # load the reading state
        with open(reading_state_path, 'r') as state_read:
            reading_state = json.load(state_read)

        for segment in range(starting_segment, num_of_segment + 1):
            # check if termination was requested
            with open(stop_path, 'r') as f:
                stop = int(f.read())

            if stop != 0:
                print('Termination required, check the stop')
                # the current segment has not been processed yet
                with open(reading_state_path, 'w') as state_write:
                    reading_state["final_read_segment"] = segment - 1
                    json.dump(reading_state, state_write)
                sys.exit()

            # transcript_segment returns None when the segment could not be
            # loaded (it already printed the reason); treat that as "no text"
            # instead of failing on the iteration below
            segment_transcript = self.transcript_segment(file_path, segment) or []

            transcript = ''.join(chunk_transcript + '\n' for chunk_transcript in segment_transcript)

            # append to the text file
            with open(des_path, 'a', encoding='utf8') as transcript_write:
                transcript_write.write(transcript)

            # remember the last processed segment
            with open(reading_state_path, 'w') as state_write:
                reading_state["final_read_segment"] = segment
                json.dump(reading_state, state_write)


class FolderTranscripter(FileTranscripter):
    '''
    Transcribe every audio file found in `<data_folder_path>/audio`.

    Settings are read from a JSON instruction file with the keys
    "sampling_rate", "chunk_size", "duration", "stop_path" and
    "data_folder_path". Progress is kept in a JSON reading-state file inside
    the data folder, with the keys "last_file_reading" and "final_read_segment".
    '''
    def __init__(self, model_inferencer, processor_feature_extractor, processor_tokenizer_decoder, device, instruction_path, reading_state_name):
        with open(instruction_path, 'r') as read_instruction:
            instruction = json.load(read_instruction)

        super().__init__(model_inferencer, processor_feature_extractor, processor_tokenizer_decoder, device, int(instruction["sampling_rate"]), int(instruction["chunk_size"]), int(instruction["duration"]))

        self.stop_path = instruction["stop_path"]
        self.data_folder_path = instruction["data_folder_path"]

        self.reading_state_path = pathlib.Path(self.data_folder_path) / reading_state_name

        # create a reading state file for the folder if it does not exist
        if not self.reading_state_path.exists():
            initialize_reading_state = {"last_file_reading": "", "final_read_segment": 0}
            # opening in 'w' mode creates the file; no separate touch is needed
            with self.reading_state_path.open('w') as initialize_state_write:
                json.dump(initialize_reading_state, initialize_state_write)
            print(f'Created reading_state for {self.data_folder_path}')
        else:
            print(f'{self.data_folder_path} has already had a reading state')


    def transcript_write_folder(self, transcript_folder_name):
        '''
        Transcribe all audio files of the folder, one file after another.

        transcript_folder_name: sub-folder of the data folder that receives
            one `<audio name>.txt` per audio file (created when missing).

        A file recorded as unfinished in the reading state is completed first,
        starting at the segment after the last one recorded. Audio files whose
        name already appears in the transcript folder are skipped.
        '''
        data_folder = pathlib.Path(self.data_folder_path)
        audio_folder = data_folder / 'audio'
        transcript_folder = data_folder / transcript_folder_name

        assert audio_folder.exists(), f'Audio folder not found: {audio_folder}'

        finished_files = []

        # create the transcript folder if it does not exist yet
        if not transcript_folder.exists():
            transcript_folder.mkdir(parents=False)
            print('Create baseline-transcript folder')
        else:
            # every existing transcript marks its audio file as already handled
            finished_files = [str(file.stem) for file in transcript_folder.iterdir()]

        # Load the reading state. The file is closed again before
        # transcript_write_file rewrites it.
        with open(self.reading_state_path, 'r') as state_read:
            last_read_inf = json.load(state_read)

        # if the last file was not completed, continue it from the first
        # segment that has not been transcribed
        unfinished_name = last_read_inf["last_file_reading"]
        if unfinished_name != "":
            print(f'Unfinished file: {data_folder / unfinished_name}, segment: {last_read_inf["final_read_segment"]}')

            self.transcript_write_file(str(audio_folder / (unfinished_name + '.mp3')), str(transcript_folder / (unfinished_name + '.txt')), int(last_read_inf["final_read_segment"]) + 1, self.reading_state_path, self.stop_path)
            last_read_inf["last_file_reading"] = ""
            last_read_inf["final_read_segment"] = 0

            # The transcript may not have existed when finished_files was
            # collected; without this the loop below would transcribe the
            # file a second time and append duplicate text.
            if unfinished_name not in finished_files:
                finished_files.append(unfinished_name)

            print(f'Completed unfinised file {data_folder / unfinished_name}')
        else:
            print('No unfinished file')

        with open(self.reading_state_path, 'w') as state_write:
            json.dump(last_read_inf, state_write)

        # transcribe the audio files of the folder (sorted for a stable order)
        for audio_file_path in sorted(audio_folder.iterdir()):
            # sub-folders cannot be transcribed
            if not audio_file_path.is_file():
                continue

            audio_file_name = str(audio_file_path.stem)

            # skip the audio files that have already been completed
            if audio_file_name in finished_files:
                print(f'Already complete {audio_folder / audio_file_name}\n--------------------------------------')
                continue

            transcript_file_name = audio_file_name + '.txt'
            transcript_path = transcript_folder / transcript_file_name

            # Record the file as "in progress" before its transcript is
            # created, so an interruption in between is resumed rather than
            # mistaken for a finished file.
            with open(self.reading_state_path, 'w') as f:
                last_read_inf["last_file_reading"] = audio_file_name
                last_read_inf["final_read_segment"] = 0
                json.dump(last_read_inf, f)

            # create the text file that stores the transcript (same name as its audio)
            transcript_path.touch()

            # transcribe the file
            print(f'Transcripting {str(transcript_path)}')
            self.transcript_write_file(str(audio_file_path), str(transcript_path), 1, self.reading_state_path, self.stop_path)

            # completed: clear the "in progress" entry of the reading state
            with open(self.reading_state_path, 'w') as f:
                last_read_inf["last_file_reading"] = ""
                last_read_inf["final_read_segment"] = 0
                json.dump(last_read_inf, f)

            finished_files.append(audio_file_name)
            print(f'Completed {data_folder / audio_file_name}\n---------------------------------------')

        print(f'COMPLETED ALL AUDIO IN FOLDER {data_folder}')

In [None]:
from transformers.utils.hub import cached_file
from importlib.machinery import SourceFileLoader
from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
from IPython.lib.display import Audio
import json
import time

# Run on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Alternative without a language model (kept for reference):
# model_name = "nguyenvulebinh/wav2vec2-base-vi-vlsp2020"
# model = SourceFileLoader("model", cached_file(model_name,filename="model_handling.py")).load_module().Wav2Vec2ForCTC.from_pretrained(model_name)
# processor = Wav2Vec2Processor.from_pretrained(model_name)
# model.to(device)
# model_inferencer = lambda input : model(**input)
# processor_feature_extractor = lambda chunks : processor.feature_extractor(chunks, sampling_rate=16000, return_tensors='pt')
# processor_tokenizer_decoder = lambda chunks :[processor.tokenizer.decode(chunks.logits[i].unsqueeze(0).argmax(dim=-1)[0].detach().cpu().numpy()) for i in range(chunks.logits.shape[0])]

# Model with LM decoding. The model class is defined in model_handling.py,
# which is fetched from the model repository and loaded as a module.
model_name = "nguyenvulebinh/wav2vec2-large-vi-vlsp2020"
model_handling_path = cached_file(model_name,filename="model_handling.py")
model_handling = SourceFileLoader("model", model_handling_path).load_module()
model = model_handling.Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)

model.to(device)


def model_inferencer(features):
    '''Run the model on the extracted features.'''
    return model(**features)


def processor_feature_extractor(chunks):
    '''Turn raw audio chunks into PyTorch model inputs.'''
    return processor.feature_extractor(chunks, sampling_rate=16000, return_tensors='pt')


def processor_tokenizer_decoder(chunks):
    '''Decode the logits of every chunk into text with the LM decoder.'''
    texts = []
    for chunk_logits in chunks.logits:
        decoded = processor.decode(chunk_logits.cpu().detach().numpy(), beam_width=100)
        texts.append(decoded.text)
    return texts


# transcript all files in a folder
folder_transcript = FolderTranscripter(
    model_inferencer,
    processor_feature_extractor,
    processor_tokenizer_decoder,
    device,
    'path to transcript_instruction.json',
    'reading_state_wav.json',
)

folder_transcript.transcript_write_folder('baseline_transcript_wav2vec2')

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
# load model and processor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
# force Vietnamese transcription instead of letting the model pick language/task
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="vietnamese", task="transcribe")

model.to(device)


def processor_feature_input(x):
    '''Convert raw audio into Whisper input features placed on `device`.'''
    features = processor(x, sampling_rate=16000, return_tensors="pt").input_features
    # input_features is a plain tensor and Tensor.to() is not in-place, so the
    # moved tensor must be returned; otherwise the inputs stay on the CPU while
    # the model may be on the GPU.
    return features.to(device)


def model_inferencer(input_features):
    '''Generate token ids for the given input features.'''
    return model.generate(input_features)


def processor_decoder(pred):
    '''Decode token ids to text.'''
    return processor.batch_decode(pred, skip_special_tokens=True)


folder_transcript = FolderTranscripter(model_inferencer, processor_feature_input, processor_decoder, device, 'drive/MyDrive/testaudio/transcript_instruction.json', 'reading_state_whisper.json')
folder_transcript.transcript_write_folder('baseline_transcript_whisper')