# Generating text transcriptions for the dataset with GigaAM

_Though GigaAM transcriptions are accurate in terms of acoustic features, they have no punctuation or capitalization. We will use them as the best available autogenerated transcriptions and enhance them with punctuation and capitalization using the SileroTE model._

In [None]:
from warnings import filterwarnings
# Install a global "ignore" filter so warnings do not clutter the cell outputs (purely cosmetic)
filterwarnings("ignore")

In [None]:
# Import all necessary libraries and functions

from tqdm.notebook import tqdm
from os import listdir, makedirs, path
from torch.hub import load as hub_load
from torch.cuda import empty_cache as clear_cuda_cache

from GigaAM.gigaam import load_model

**Note: the code in the GigaAM folder is basically just a copy of the [GigaAM fork](https://github.com/KalininVD/GigaAM-upgraded.git), so make sure you have installed it properly (with `pip install ...`, see `README.md` in the repo).**

Loading the main STT model

In [None]:
# Loader options are gathered in one place so they are easy to tweak
stt_model_options = {
    "model_name": "v2_rnnt", # the best available model for now (march 2025)
    "fp16_encoder": False,
    "device": "cuda", # switch to `cpu` if no CUDA-supporting GPU is available
}

stt_model = load_model(**stt_model_options)

Set up audios and transcriptions paths:

In [None]:
# Placeholders: replace both values with real paths before running the transcription loop.
# That loop reads audio from `<main_audio_path>/<subject>/<file>` and writes text to `<main_text_path>/<subject>/<file>`.
main_audio_path = "<audio_path>" # Absolute path to the directory with the audio files
main_text_path = "<transcriptions_path>" # Absolute path to the directory with the transcriptions (should be empty or not existing)

### Perform the transcribe process

In [None]:
# Transcribe every audio file, mirroring the `<subject>/<file>` layout of the
# audio directory into the transcriptions directory (one `.txt` per audio file).
for subject in tqdm(sorted(listdir(main_audio_path))): # Use tqdm to show progress
    audio_subj_path = path.join(main_audio_path, subject)

    # A stray file next to the subject folders would make the inner `listdir` fail, so skip it
    if not path.isdir(audio_subj_path):
        continue

    text_subj_path = path.join(main_text_path, subject)

    makedirs(text_subj_path, exist_ok=True)

    for file in tqdm(sorted(listdir(audio_subj_path))): # Use tqdm to show progress
        audio_path = path.join(audio_subj_path, file)

        # Swap only the final extension: `str.replace` would also rewrite ".wav"
        # occurring inside the name and would leave e.g. ".WAV" untouched
        text_name = path.splitext(file)[0] + ".txt"
        text_path = path.join(text_subj_path, text_name) # Make sure to have exactly WAV files as audio sources!

        # The long-form result is a sequence of utterances; join their texts into one line
        recognition_result = stt_model.transcribe_longform(audio_path)
        transcription = " ".join(utterance["transcription"] for utterance in recognition_result)

        with open(text_path, "w", encoding="utf-8") as f:
            f.write(transcription)

Clear up memory

In [None]:
import gc

# Drop the notebook's reference to the STT model so its memory can be reclaimed
del stt_model

# Collect reference cycles first: objects that are only reachable through a cycle
# are not freed by `del`, and `empty_cache` can only release memory that is no longer in use
gc.collect()

clear_cuda_cache()

### Enhance transcriptions (adding capitalization & punctuation)

In [None]:
# Load Silero TE model from torch hub
silero_te_bundle = hub_load(repo_or_dir='snakers4/silero-models', model='silero_te')

# The hub call yields five objects; unpack them under the same names as before
te_model, example_texts, languages, punct, apply_te = silero_te_bundle

Helper function wrapping the TE model call

In [None]:
def enhance_transcription(transcription: str) -> str:
    """Return `transcription` with punctuation and capitalization added.

    The text is lowercased and its whitespace is normalized (every run of
    whitespace, including tabs and newlines, becomes a single space; leading
    and trailing whitespace is dropped) before it is passed to `apply_te`
    with `lan='ru'`.

    Args:
        transcription: Raw transcription text.

    Returns:
        The result of `apply_te` for the normalized text, or an empty string
        when the input contains nothing but whitespace (the model is not
        called in that case).
    """

    # `split()` without arguments splits on any whitespace and discards empty pieces
    normalized = " ".join(transcription.lower().split())

    if not normalized:
        return "" # Nothing to enhance, so skip the model call

    return apply_te(normalized, lan='ru')

In [None]:
# Placeholder: replace with a real path before running the enhancement loop.
# NOTE: this rebinds `text_path`, which the transcription loop also used for its per-file output paths.
text_path = "<enhanced_transcriptions_path>" # Path to the enhanced transcriptions directory

In [None]:
# Walk the raw transcriptions tree and write an enhanced copy of every file
# under the same `<subject>/<file>` name inside the enhanced transcriptions directory
for subject_name in tqdm(sorted(listdir(main_text_path))): # Progress over subjects
    raw_dir = path.join(main_text_path, subject_name)
    enhanced_dir = path.join(text_path, subject_name)

    makedirs(enhanced_dir, exist_ok=True)

    for file_name in tqdm(sorted(listdir(raw_dir))): # Progress over the subject's files
        with open(path.join(raw_dir, file_name), "r", encoding="utf-8") as src:
            raw_text = src.read()

        # Enhance before opening the output so a failure does not leave an empty file behind
        enhanced_text = enhance_transcription(raw_text)

        with open(path.join(enhanced_dir, file_name), "w", encoding="utf-8") as dst:
            dst.write(enhanced_text)