In [None]:
!pip install sounddevice transformers torchaudio scipy playsound google-generativeai soundfile pydub pygobject

Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting playsound
  Downloading playsound-1.3.0.tar.gz (7.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Building wheels for collected packages: playsound
  Building wheel for playsound (setup.py) ... [?25l[?25hdone
  Created wheel for playsound: filename=playsound-1.3.0-py3-none-any.whl size=7020 sha256=e986d367148ffb2b38045c887d2aba9411471052eb490f3610be4ec371e2d98d
  Stored in directory: /root/.cache/pip/wheels/90/89/ed/2d643f4226fc8c7c9156fc28abd8051e2d2c0de37ae51ac45c
Successfully built playsound
Installing collected packages: playsound, sounddevice
Successfully installed playsound-1.3.0 sounddevice-0.5.1


In [None]:
import numpy as np
import soundfile as sf
import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoModelForTextToWaveform
from scipy.io.wavfile import write as write_wav
import google.generativeai as genai
#Instead of playsound, use pydub's AudioSegment
from pydub import AudioSegment
from pydub.playback import play
import time
from google.colab import files

# Function to upload audio file
def upload_audio():
    """Open the Colab upload dialog and return the first uploaded filename.

    Returns:
        The first key of the mapping returned by ``files.upload()``.

    Raises:
        ValueError: If the upload returned no files. Previously this case
            silently returned ``None`` and failed later during transcription.
    """
    uploaded = files.upload()
    # Only a single file is consumed by the rest of the flow: take the first key.
    filename = next(iter(uploaded), None)
    if filename is None:
        raise ValueError("No file was uploaded; please select an audio file.")
    print(f"Uploaded file: {filename}")
    return filename

# Load Whisper model
def load_whisper_model(model_id="openai/whisper-large-v2"):
    """Build an automatic-speech-recognition pipeline for a Whisper checkpoint.

    When CUDA is available the model runs on ``cuda:0`` with float16 weights;
    otherwise it runs on the CPU with float32 weights.

    Args:
        model_id: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        The object produced by ``pipeline("automatic-speech-recognition", ...)``.
    """
    if torch.cuda.is_available():
        device, torch_dtype = "cuda:0", torch.float16
    else:
        device, torch_dtype = "cpu", torch.float32
    print(f"Running on {device.upper()}")

    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype)
    processor = AutoProcessor.from_pretrained(model_id)
    model.to(device)

    # The processor bundles both the tokenizer and the feature extractor
    # that the pipeline expects as separate arguments.
    asr_options = {
        "model": model,
        "tokenizer": processor.tokenizer,
        "feature_extractor": processor.feature_extractor,
        "torch_dtype": torch_dtype,
        "device": device,
    }
    return pipeline("automatic-speech-recognition", **asr_options)

# Speech-to-text conversion
def speech_to_text(audio_file, pipe=None):
    """Transcribe audio to text with a speech-recognition pipeline.

    Args:
        audio_file: Input forwarded unchanged to the pipeline (``main`` passes
            the uploaded filename).
        pipe: Optional pre-built pipeline to reuse across calls. When omitted,
            a new one is created with ``load_whisper_model()``, which loads the
            model again on every call.

    Returns:
        The ``"text"`` entry of the pipeline's result.
    """
    if pipe is None:
        pipe = load_whisper_model()
    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = pipe(audio_file)
    return result["text"]

# Google Gemini API for grammar correction
class GoogleGeminiAPI:
    """Small wrapper around ``google.generativeai`` with lazy model creation.

    The generative model is not created in the constructor; it is built the
    first time ``generate_text`` is called (or when ``load_model`` is called
    explicitly).
    """

    def __init__(self, api_key, model_name="models/gemini-1.0-pro-latest"):
        """Store the credentials and model name; no API call is made here."""
        self.api_key = api_key
        self.model_name = model_name
        # Populated by load_model(); None until then.
        self.model = None

    def gcp_config(self):
        """Configure the ``genai`` library with the stored API key."""
        genai.configure(api_key=self.api_key)

    def load_model(self):
        """Configure the library, then create the model named by ``model_name``."""
        self.gcp_config()
        self.model = genai.GenerativeModel(self.model_name)

    def generate_text(self, prompt, text):
        """Send ``prompt + text`` to the model and return the response's text.

        Args:
            prompt: Instruction prefix.
            text: Content appended directly after the prompt (no separator
                is inserted).

        Returns:
            The ``text`` attribute of the response from ``generate_content``.
        """
        model_missing = not self.model
        if model_missing:
            self.load_model()
        full_prompt = prompt + text
        reply = self.model.generate_content(full_prompt)
        return reply.text

def correct_grammar(api_key, text):
    """Ask Gemini, primed as a language teacher, to respond to ``text``.

    Args:
        api_key: Google Gemini API key used to build the client.
        text: Text to send; it is appended after the teacher prompt.

    Returns:
        The text returned by ``GoogleGeminiAPI.generate_text``.
    """
    teacher_prompt = '''You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose...
        '''
    client = GoogleGeminiAPI(api_key)
    corrected = client.generate_text(teacher_prompt, text)
    return corrected

# Text-to-speech conversion using Bark
def text_to_speech_bark(text, output_file="speech.wav", amplification_db=10):
    """Synthesize ``text`` with Bark, save it as a WAV file, and play it amplified.

    Args:
        text: Text to synthesize.
        output_file: Path of the WAV file written with the generated audio.
        amplification_db: Gain in dB applied before playback. The gain affects
            playback only; the file on disk holds the un-amplified audio.
    """
    # Load Bark model and processor
    processor = AutoProcessor.from_pretrained("suno/bark")
    model = AutoModelForTextToWaveform.from_pretrained("suno/bark")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = processor(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        waveform = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
    audio_array = waveform.cpu().numpy().squeeze()

    # Take the sample rate from the checkpoint's generation config instead of
    # hard-coding it; fall back to the previous constant (24 kHz) if absent.
    generation_config = getattr(model, "generation_config", None)
    sample_rate = getattr(generation_config, "sample_rate", 24000)

    # Save generated audio to a file
    write_wav(output_file, sample_rate, audio_array)

    # Load the audio file with pydub
    sound = AudioSegment.from_wav(output_file)

    # Amplify the audio by a specified gain in dB
    amplified_sound = sound.apply_gain(amplification_db)

    # Play the amplified sound
    play(amplified_sound)

# Main process flow
def main(api_key):
    """Run the full flow: upload -> transcribe -> correct -> synthesize.

    Each stage prints its result and its wall-clock duration; the total
    duration is printed at the end.

    Args:
        api_key: Google Gemini API key used for the correction step.
    """
    pipeline_started = time.time()

    # Step 1: Upload audio
    print("Please upload your audio file (WAV format)")
    upload_started = time.time()
    audio_file = upload_audio()
    print(f"Audio uploaded in {time.time() - upload_started:.2f} seconds.")

    # Step 2: Convert uploaded audio to text
    print("Converting speech to text...")
    stt_started = time.time()
    transcribed_text = speech_to_text(audio_file)
    print(f"Transcribed Text: {transcribed_text}")
    print(f"Speech to text completed in {time.time() - stt_started:.2f} seconds.")

    # Step 3: Correct grammar and pronunciation
    print("Correcting grammar and pronunciation...")
    correction_started = time.time()
    corrected_text = correct_grammar(api_key, transcribed_text)
    print(f"Corrected Text: {corrected_text}")
    print(f"Grammar correction completed in {time.time() - correction_started:.2f} seconds.")

    # Step 4: Convert corrected text to speech
    print("Converting corrected text to speech...")
    tts_started = time.time()
    output_audio_file = "final_output.wav"
    text_to_speech_bark(corrected_text, output_file=output_audio_file)
    print(f"Final speech saved to {output_audio_file}")
    print(f"Text to speech completed in {time.time() - tts_started:.2f} seconds.")

    total_elapsed = time.time() - pipeline_started
    print(f"Total process completed in {total_elapsed:.2f} seconds.")

if __name__ == "__main__":
    # SECURITY: never store an API key in the notebook. A key that was
    # previously hard-coded here must be treated as leaked and revoked.
    import getpass
    import os

    # Prefer the environment variable; otherwise prompt without echoing the key.
    api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google Gemini API key: ")
    main(api_key)


Please upload your audio file (WAV format)


Saving Recording (5).mp3 to Recording (5) (3).mp3
Uploaded file: Recording (5) (3).mp3
Audio uploaded in 12.58 seconds.
Converting speech to text...
Running on CUDA:0




Transcribed Text:  Yesterday I go to the store to buy some vegetables and tomatoes.
Speech to text completed in 32.36 seconds.
Correcting grammar and pronunciation...
Corrected Text: Yesterday I went to the store to buy some vegetables and tomatoes.

**Explanation:**
The verb "go" is in the present tense, so it should be changed to "went" to match the past tense of "yesterday."
Grammar correction completed in 4.79 seconds.
Converting corrected text to speech...


  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Final speech saved to final_output.wav
Text to speech completed in 87.54 seconds.
Total process completed in 137.27 seconds.
