In [1]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. GPU in use:", torch.cuda.get_device_name(0))
else:
    print("CUDA is NOT available. Using CPU.")

CUDA is NOT available. Using CPU.


### Importing Libraries

In [26]:
# # importing google flan-t5-small for translation refinement
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# from transformers import AutoTokenizer, AutoModelForCausalLM

# c2t model loading example
import whisper_s2t

#preprocessing audio
import importlib
import overlap_split_and_preprocess
importlib.reload(overlap_split_and_preprocess)
from overlap_split_and_preprocess import split_preprocess_and_save_chunks
import librosa

# importing post processing model
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
# determine the device
device = "cuda" if torch.cuda.is_available() else "cpu"

### Loading the model

In [4]:
custom_asr_options = {
    # "max_new_tokens": 448,
    "num_beams": 5,
    "condition_on_prev_tokens": True,
    "compression_ratio_threshold": 1.3,
    "temperature": (0.0, 0.2, 0.4),
    "logprob_threshold": -0.8,
    "no_speech_threshold": 0.35,
    "return_timestamps": True,
}

ct2_model = whisper_s2t.load_model(
    model_identifier="large-v2", 
    backend='CTranslate2', 
    compute_type='int8',  # Best for cpu
    device=device,
    asr_options=custom_asr_options
)

'ffmpeg' is not built with soxr resampler, using 'swr' resampler. This may degrade performance.


  import pkg_resources


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

  @torch.cuda.amp.autocast()


### Loading Post processing model and tokenizer

In [25]:
# Load the grammar correction model and tokenizer
model_name = "grammarly/grammar-check"
post_tokenizer = AutoTokenizer.from_pretrained(model_name)
post_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set the model to evaluation mode
post_model.eval()

NameError: name 'AutoTokenizer' is not defined

### Preprocessing and chunking audio

In [5]:
# Usage example:
audio_path = "megha.mp3"
audio, sr = librosa.load(audio_path, sr=None, mono=True)
chunk_files = split_preprocess_and_save_chunks(audio,
                                            sr, chunk_duration=30,
                                            overlap_duration=5,
                                            temp_dir="temp_chunks")

Estimated speech rate: 6.24 onsets/sec
Speech rate is normal.
Original length: 1438464, Processed length: 1438464
RMS dB level: -16.39 dBFS
Estimated speech rate: 5.90 onsets/sec
Speech rate is normal.
Original length: 1440000, Processed length: 1440000
RMS dB level: -15.12 dBFS
Estimated speech rate: 6.57 onsets/sec
Speech rate is normal.
Original length: 394752, Processed length: 394752
RMS dB level: -14.48 dBFS


### Warmup Step

In [6]:
# Warmup step: run a dummy transcription on a short audio file
dummy_file = "test_short.aac"  # Path to a very short audio file (can be silence)
dummy_results = ct2_model.transcribe_with_vad(
    [dummy_file],
    lang_codes=['ml'],
    tasks=['translate'],
    initial_prompts=[None],
    batch_size=1,
)

Transcribing: 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


### Translating main audio

In [7]:
# actual inference
results = ct2_model.transcribe_with_vad(
    chunk_files,
    lang_codes=['ml'],
    tasks=['translate'],
    initial_prompts=[None],
    batch_size=16,
)

# Each element in results is a list of segments for that audio file
# Example: get all segments from all files
all_segments = []
for file_segments in results:
    all_segments.extend(file_segments)

Transcribing: 100%|██████████| 300/300 [02:56<00:00,  1.70it/s]


In [None]:
print(all_segments)


[{'text': 'Before the establishment of the IKKK, Kerala was divided into four parts. Thiruvidhamkore, Kochi, Malabar, South Kannada. The people of these villages were of the same culture, language and lifestyle. Kerala was formed by combining them. In 1920, the Nagpur Congress decided to form the committee of the organization in the language-based position. In 1921,', 'avg_logprob': -0.6828007055132577, 'no_speech_prob': 0.00013107861741445959, 'start_time': np.float64(1.38), 'end_time': 29.968}, {'text': 'The committee was formed in 1921 and the Congress Committee was established in 1928.', 'avg_logprob': -1.2479616867171393, 'no_speech_prob': 0.1292824149131775, 'start_time': np.float64(0.04), 'end_time': np.float64(18.98)}, {'text': 'The government has also announced that the government will provide the necessary funds for the construction of the Akhila Kerala Kudiyan Samayalam in 1928.', 'avg_logprob': -1.3518198889655035, 'no_speech_prob': 0.43902143836021423, 'start_time': np.flo

In [22]:
# Suppose segments is your list of segment dicts
texts = [seg["text"] for seg in all_segments if "text" in seg]
print(texts)

['Before the establishment of the IKKK, Kerala was divided into four parts. Thiruvidhamkore, Kochi, Malabar, South Kannada. The people of these villages were of the same culture, language and lifestyle. Kerala was formed by combining them. In 1920, the Nagpur Congress decided to form the committee of the organization in the language-based position. In 1921,', 'The committee was formed in 1921 and the Congress Committee was established in 1928.', 'The government has also announced that the government will provide the necessary funds for the construction of the Akhila Kerala Kudiyan Samayalam in 1928.', 'In 1928, Akhila Kerala Kudiyan Samayalam was inaugurated and the need for Kerala was met.']


In [23]:
def remove_consecutive_duplicates_texts(texts):
    cleaned = []
    prev_text = None
    for text in texts:
        text = text.strip()
        if text and text != prev_text:
            cleaned.append(text)
        prev_text = text
    return cleaned

cleaned_texts = remove_consecutive_duplicates_texts(texts)
print(cleaned_texts)

['Before the establishment of the IKKK, Kerala was divided into four parts. Thiruvidhamkore, Kochi, Malabar, South Kannada. The people of these villages were of the same culture, language and lifestyle. Kerala was formed by combining them. In 1920, the Nagpur Congress decided to form the committee of the organization in the language-based position. In 1921,', 'The committee was formed in 1921 and the Congress Committee was established in 1928.', 'The government has also announced that the government will provide the necessary funds for the construction of the Akhila Kerala Kudiyan Samayalam in 1928.', 'In 1928, Akhila Kerala Kudiyan Samayalam was inaugurated and the need for Kerala was met.']


In [None]:
from rapidfuzz import fuzz

def remove_fuzzy_duplicates_texts(texts, threshold=90):
    """
    Removes texts that are very similar to the previous kept text.
    threshold: similarity score (0-100), higher means stricter.
    """
    cleaned = []
    prev_text = ""
    for text in texts:
        t = text.strip()
        if fuzz.ratio(t, prev_text) < threshold:
            cleaned.append(t)
            prev_text = t
    return cleaned

# Usage:
fuzzy_cleaned_texts = remove_fuzzy_duplicates_texts(cleaned_texts, threshold=90)
final_text = "".join(fuzzy_cleaned_texts)
print(final_text)

Before the establishment of the IKKK, Kerala was divided into four parts. Thiruvidhamkore, Kochi, Malabar, South Kannada. The people of these villages were of the same culture, language and lifestyle. Kerala was formed by combining them. In 1920, the Nagpur Congress decided to form the committee of the organization in the language-based position. In 1921,The committee was formed in 1921 and the Congress Committee was established in 1928.The government has also announced that the government will provide the necessary funds for the construction of the Akhila Kerala Kudiyan Samayalam in 1928.In 1928, Akhila Kerala Kudiyan Samayalam was inaugurated and the need for Kerala was met.


### Further refining translated text using another llm

In [None]:
# Function to correct grammatical errors in a sentence
def correct_grammar(text):
    # Tokenize the input text
    inputs = post_tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate corrected text
    with torch.no_grad():
        outputs = post_model.generate(**inputs)
    
    # Decode the generated tokens back to text
    corrected_text = post_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Correct the grammar
corrected_text = correct_grammar(final_text)

print("Original Text:", final_text)
print("Corrected Text:", corrected_text)