# **Install All Required Libraries**

In [None]:
!pip install sounddevice scipy
!pip install transformers
!pip install librosa
!pip install soundfile

Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.1


In [None]:
!git clone https://github.com/VarunGumma/IndicTransToolkit
%cd IndicTransToolkit
!pip install --editable ./
!pip install indic-nlp-library
!pip install gTTS
import os
print(os.getcwd())

Cloning into 'IndicTransToolkit'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 155 (delta 34), reused 45 (delta 30), pack-reused 95 (from 1)[K
Receiving objects: 100% (155/155), 3.88 MiB | 5.57 MiB/s, done.
Resolving deltas: 100% (62/62), done.
/content/IndicTransToolkit
Obtaining file:///content/IndicTransToolkit
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library (from IndicTransToolkit==1.0.2)
  Cloning https://github.com/VarunGumma/indic_nlp_library to /tmp/pip-install-4kvvmq7m/indic-nlp-library-it2_e5316c52fa224c96a91635912ebd31ca
  Running command git clone --filter=blob:none --quiet https://github.com/VarunGumma/indic_nlp_library /tmp/pip-install-4kvvmq7m/indic-nlp-library-it2_e5316c52fa224c96a91635912ebd31ca
  Resolved https://github.com/VarunGumma/indic_nlp_library to commit 60

In [None]:
# Leave the IndicTransToolkit checkout and go back to its parent directory.
%cd ..

/content


# **Speech To Text Class**

In [None]:
import torch
import librosa
import soundfile as sf
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


class AudioTranscriber:
    """Speech-to-text helper built on a Hugging Face Whisper checkpoint.

    The processor and model are loaded once in the constructor and reused for
    every call to `transcribe` / `transcribe_batch`.
    """

    # Sample rate (Hz) used both when loading audio and when building the
    # processor inputs, so the two values cannot drift apart.
    SAMPLE_RATE = 16000

    def __init__(self, model_name="openai/whisper-large-v2", device=None):
        """
        Initialize the AudioTranscriber with specified model.

        Args:
            model_name (str): Name or path of the Whisper model to use
            device (str): Device to run the model on ('cuda', 'cpu', or None for auto-detection)

        Raises:
            RuntimeError: If the processor or model cannot be loaded.
        """
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize processor and model
        self.processor = None
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the Whisper model and processor and move the model to `self.device`.

        Raises:
            RuntimeError: If loading fails; the original exception is chained.
        """
        try:
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.model_name)
            self.model.to(self.device)
        except Exception as e:
            # Drop any partially loaded objects so they are not kept alive by
            # this instance after a failed load.
            self.processor = None
            self.model = None
            raise RuntimeError(f"Failed to load model: {str(e)}") from e

    def _load_audio(self, file_path, sample_rate=SAMPLE_RATE):
        """
        Load and preprocess audio file.

        Args:
            file_path (str): Path to the audio file
            sample_rate (int): Target sample rate for the audio

        Returns:
            numpy.ndarray: Loaded and resampled audio data

        Raises:
            RuntimeError: If the file cannot be read; the original exception is chained.
        """
        try:
            audio, _ = librosa.load(file_path, sr=sample_rate)
            return audio
        except Exception as e:
            raise RuntimeError(f"Failed to load audio file: {str(e)}") from e

    def transcribe(self, audio_path, return_timestamps=False):
        """
        Transcribe audio file to text.

        Args:
            audio_path (str): Path to the audio file
            return_timestamps (bool): Whether to return timestamps with the
                transcription. When True, timestamp generation is requested from
                the model and the timestamp markers are kept in the decoded text.

        Returns:
            str: Transcribed text

        Raises:
            RuntimeError: If any step fails; the original exception is chained.
        """
        # The extra options are only added when timestamps are requested, so
        # the default path makes exactly the same calls as before.
        generate_kwargs = {}
        decode_kwargs = {"skip_special_tokens": True}
        if return_timestamps:
            generate_kwargs["return_timestamps"] = True
            # Without this the tokenizer strips the timestamp tokens again and
            # the flag has no visible effect on the returned text.
            decode_kwargs["decode_with_timestamps"] = True

        try:
            # Load and preprocess audio
            audio = self._load_audio(audio_path, sample_rate=self.SAMPLE_RATE)

            # Prepare input features
            # NOTE(review): recordings longer than the model's input window are
            # presumably truncated by the processor — confirm for long files.
            inputs = self.processor(
                audio,
                return_tensors="pt",
                sampling_rate=self.SAMPLE_RATE
            ).to(self.device)

            # Generate transcription (inference only, no gradients needed)
            with torch.no_grad():
                generated = self.model.generate(
                    inputs.input_features,
                    **generate_kwargs
                )

            # Decode the output; a single file was passed, so take the first entry
            return self.processor.batch_decode(generated, **decode_kwargs)[0]

        except Exception as e:
            raise RuntimeError(f"Transcription failed: {str(e)}") from e

    def transcribe_batch(self, audio_paths):
        """
        Transcribe multiple audio files.

        Files are processed one after another with `transcribe`.

        Args:
            audio_paths (list): List of paths to audio files

        Returns:
            list: List of transcribed texts
        """
        return [self.transcribe(path) for path in audio_paths]

    def __del__(self):
        """Clean up resources when the object is destroyed."""
        try:
            del self.model
            del self.processor
            torch.cuda.empty_cache()
        except Exception:
            # Best-effort cleanup: attributes may be missing if construction
            # failed, and modules may already be torn down at interpreter exit.
            pass

# **Using the Speech To Text Class**

In [None]:
# Build the transcriber with its default model and device settings.
transcriber = AudioTranscriber()

# Recording to transcribe.
audio_file = "/content/Sample02.mp3"
transcription = transcriber.transcribe(audio_file)
print("Transcription:", transcription)

# Wrap the text in a list so later cells can treat it as a batch of sentences.
sttoutput = [transcription]

RuntimeError: Failed to load model: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 5.06 MiB is free. Process 20124 has 14.74 GiB memory in use. Of the allocated memory 14.19 GiB is allocated by PyTorch, and 430.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory.
# NOTE(review): this presumably does not free models that are still referenced
# (e.g. `transcriber`) — confirm whether that object should be deleted first.
torch.cuda.empty_cache()

# **Machine Translation Class**

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor



class IndicTranslator:
    """Batch translator built on an IndicTrans2 checkpoint.

    Combines the IndicTransToolkit `IndicProcessor` (pre/post-processing) with
    the checkpoint's tokenizer and seq2seq model.
    """

    def __init__(self, checkpoint_dir="ai4bharat/indictrans2-indic-indic-1B", batch_size=4, quantization=None):
        """
        Initialize the IndicTranslator with model configuration.

        Args:
            checkpoint_dir (str): Path to the model checkpoint
            batch_size (int): Size of batches for translation (must be >= 1)
            quantization (str): Quantization type ('4-bit', '8-bit', or None).
                Any other value loads the model without quantization.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        # Fail early: a zero batch size would otherwise only blow up inside
        # `translate`, and a negative one would silently translate nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.checkpoint_dir = checkpoint_dir
        self.batch_size = batch_size
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = IndicProcessor(inference=True)
        self.tokenizer, self.model = self._initialize_model_and_tokenizer(quantization)

    def _initialize_model_and_tokenizer(self, quantization):
        """Initialize and configure the model and tokenizer.

        Args:
            quantization (str): '4-bit', '8-bit', or anything else for no quantization.

        Returns:
            tuple: `(tokenizer, model)` with the model switched to eval mode.
        """
        if quantization == "4-bit":
            qconfig = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        elif quantization == "8-bit":
            # Double quantization and compute dtype are 4-bit-only options of
            # BitsAndBytesConfig; there are no `bnb_8bit_*` equivalents.
            qconfig = BitsAndBytesConfig(load_in_8bit=True)
        else:
            qconfig = None

        tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_dir, trust_remote_code=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(
            self.checkpoint_dir,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            quantization_config=qconfig,
        )

        # Only an unquantized model is moved / cast by hand here.
        if qconfig is None:
            model = model.to(self.device)
            if self.device == "cuda":
                model.half()

        model.eval()
        return tokenizer, model

    def translate(self, texts, source_lang, target_lang):
        """
        Translate a list of texts from source language to target language.

        Args:
            texts (list): List of input texts to translate. A single string is
                also accepted and treated as a one-element list.
            source_lang (str): Source language code (e.g., 'tam_Taml')
            target_lang (str): Target language code (e.g., 'hin_Deva')

        Returns:
            list: List of translated texts
        """
        # A bare string would otherwise be sliced character by character below.
        if isinstance(texts, str):
            texts = [texts]

        translations = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start: start + self.batch_size]

            # Preprocess the batch
            batch = self.processor.preprocess_batch(batch, src_lang=source_lang, tgt_lang=target_lang)

            # Tokenize
            inputs = self.tokenizer(
                batch,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            ).to(self.device)

            # Generate translations (inference only, no gradients needed)
            with torch.no_grad():
                generated_tokens = self.model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=256,
                    num_beams=5,
                    num_return_sequences=1,
                )

            # Decode translations inside the tokenizer's target-side context
            with self.tokenizer.as_target_tokenizer():
                decoded = self.tokenizer.batch_decode(
                    generated_tokens.detach().cpu().tolist(),
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                )

            # Postprocess translations
            translations.extend(self.processor.postprocess_batch(decoded, lang=target_lang))

            # Release this batch's tensors before the next iteration.
            del inputs
            torch.cuda.empty_cache()

        return translations

    def __del__(self):
        """Clean up resources when the object is destroyed."""
        try:
            del self.tokenizer
            del self.model
            torch.cuda.empty_cache()
        except Exception:
            # Best-effort cleanup: attributes may be missing if construction
            # failed, and modules may already be torn down at interpreter exit.
            pass

# **Using the Machine Translation Class**

In [None]:
# Build the translator with its default checkpoint and batch size.
translator = IndicTranslator()

# Sentences to translate: the output of the speech-to-text step.
# To try this cell without running speech-to-text, assign a literal list of
# sentences written in the source language instead.
texts = sttoutput

# Human-readable language names mapped to the language tags passed to the translator.
lang_dict = {
    'Assamese': 'asm_Beng',
    'Bengali': 'ben_Beng',
    'Bodo': 'brx_Deva',
    'Dogri': 'doi_Deva',
    'English': 'eng_Latn',
    'Konkani': 'gom_Deva',
    'Gujarati': 'guj_Gujr',
    'Hindi': 'hin_Deva',
    'Kannada': 'kan_Knda',
    'Kashmiri (Arabic)': 'kas_Arab',
    'Kashmiri (Devanagari)': 'kas_Deva',
    'Maithili': 'mai_Deva',
    'Telugu': 'tel_Telu',
    'Tamil': 'tam_Taml',
}

source_lang = lang_dict['Kannada']
target_lang = lang_dict['Tamil']

translations = translator.translate(texts, source_lang, target_lang)
print(translations)

['மருத்துவமனை எங்கே? ']


# **Text To Speech**

In [None]:
from gtts import gTTS
import os

#translations="నా పేరు విఘ్నేష్ నేను బంకులూర్లో నివసిస్తున్నాను. "
# translations="என் பெயர் விக்னேஷ் நான் பங்க்குலூரில் வசிக்கிறேன்"
# The notebook already runs on IPython (it uses `%`/`!` magics), so its audio
# widget is used for playback instead of shelling out to a platform-specific
# command.
from IPython.display import Audio, display

# Translation language tags mapped to the language codes handed to gTTS.
# NOTE(review): not every code below is necessarily supported by gTTS — confirm
# against `gtts.lang.tts_langs()` before relying on a given language.
indian_languages = {
    'asm_Beng': 'as',
    'ben_Beng': 'bn',
    'brx_Deva': 'brx',
    'doi_Deva': 'doi',
    'gom_Deva': 'gom',
    'guj_Gujr': 'gu',
    'hin_Deva': 'hi',
    'kan_Knda': 'kn',
    'kas_Arab': 'ks',
    'kas_Deva': 'ks',
    'mai_Deva': 'mai',
    'tel_Telu': 'te',
    'tam_Taml': 'ta',
    # `eng_Latn` is offered by the translation cell's language table as well.
    'eng_Latn': 'en',
}

# Only the first translated sentence is spoken.
translation = translations[0]
language = indian_languages[target_lang]
speech = gTTS(text=translation, lang=language, slow=False)

# Save once and reuse the same path for playback. The previous code saved to
# "output_<lang>.mp3" but then tried to open "output.mp3" with the
# Windows-only `start` command.
output_path = f"output_{language}.mp3"
speech.save(output_path)
display(Audio(filename=output_path))

32512