In [None]:
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting transformers
  Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets[audio]
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets[audio])
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp310-cp

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
result = pipe("/content/Recording (3).mp3")
print(result["text"])

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


 Yesterday I got to the store to buy some vegetables and tomatoes. The cashier said I don't have enough money so I leave without buying anything.


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("grammarly/coedit-large")
model = AutoModelForSeq2SeqLM.from_pretrained("grammarly/coedit-large").cuda()  # Move model to GPU

def correct_grammar(text):
    # Add a more specific prompt
    prompt = (
        '''
        You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose. You will explain language rules, guide pronunciation, and provide examples to help learners improve.
        Whenever the learner makes a mistake—whether grammatical, pronunciation-related, or structural—you will gently correct them, explain the error, and provide the correct way to say it. You will also offer helpful tips to improve their language skills, tailoring your feedback to their level of expertise. Act as a supportive tutor, fostering confidence and encouraging progress. **Do not change the meaning of the original text.**
        '''
    )

    input_text = prompt + text  # Concatenate the prompt with the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to('cuda')  # Move tensors to GPU

    # Increase max_length and other parameters to avoid early stopping
    outputs = model.generate(
        **inputs,
        max_length=512,  # Increase the maximum number of tokens to generate
        no_repeat_ngram_size=2,  # Avoid repeating n-grams
        early_stopping=True
    )

    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example text with errors
text = result["text"]

# Correct the grammar with the custom prompt
corrected_text = correct_grammar(text)
print("Corrected text:", corrected_text)


Corrected text: Yesterday I went to the store to buy some vegetables and tomatoes. The cashier said I didn't have enough money so I left without buying anything.


In [None]:
!pip install --upgrade pip
!pip install --upgrade transformers sentencepiece datasets[audio]

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting datasets[audio]
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=15.0.0 (from datasets[audio])
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  Downloading xxhash-3.5.0-cp310-cp

In [None]:
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf

synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# You can replace this embedding with your own as well.

speech = synthesiser(corrected_text, forward_params={"speaker_embeddings": speaker_embedding})

sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.1.tar.gz (63.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.1-cp310-cp310-linux_x86_64.whl size=3485345 sha256=8e70a1edac47fda1262048b0991d53e9216428f0997bd3db1a270cb6f9ee8efe
  Stored in directory: /root/.cache/pip/wheels/f8/b0/a2/f47d952aec7ab061b9e2a34

In [None]:
from llama_cpp import Llama
import torch

# Load the Llama model from the pre-trained GGUF file
llm = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf",  # Ensure the GGUF file path is correct
    # Specify the device during model loading
    n_gpu=torch.cuda.device_count() if torch.cuda.is_available() else 0
)


# Define a specific prompt for grammar correction
prompt = (
        '''
        You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose. You will explain language rules, guide pronunciation, and provide examples to help learners improve.
        Whenever the learner makes a mistake—whether grammatical, pronunciation-related, or structural—you will gently correct them, explain the error, and provide the correct way to say it. You will also offer helpful tips to improve their language skills, tailoring your feedback to their level of expertise. Act as a supportive tutor, fostering confidence and encouraging progress. make sure to give the output in json format
        '''
)

# Define the input text with errors
input_text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."

# Create the complete chat input
full_input = prompt + input_text

# Create a chat completion by providing messages
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": full_input
        }
    ]
)

# Assuming 'response' is the dictionary from the Llama model completion
generated_response = response['choices'][0]['message']['content']

# Print the actual response content
print("Generated response:", generated_response)


llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /root/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str             

In [None]:
from llama_cpp import Llama
import torch

# Load the Llama model from the pre-trained GGUF file
llm = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf",  # Ensure the GGUF file path is correct
    # Specify the device during model loading
    n_gpu=torch.cuda.device_count() if torch.cuda.is_available() else 0
)

# Define the input text with errors
input_text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."

# Create the complete instruction for grammar correction
instruction = (
    "You are a knowledgeable language teacher. Correct the grammar and enhance the clarity of the following text. "
    "Make sure to give the output in stringified JSON format."
)

# Create the complete chat input
full_input = instruction + "\n" + input_text

# Create a chat completion by providing messages
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": full_input
        }
    ]
)

# Assuming 'response' is the dictionary from the Llama model completion
generated_response = response['choices'][0]['message']['content']

# Print the actual response content
print("Generated response:", generated_response)


llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /root/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str             

Generated response: ### Corrected Text in JSON Format

```json
{
  "I": {
    "have": "two brothers and one sister.",
    "My elder brother": {
      "is working": "in an IT company."
    },
    "My younger brother": {
      "is studying": "in school."
    },
    "enjoy": {
      "reading books": "and playing cricket",
      "in my free time."
    },
    "Yesterday": {
      "I celebrated": "my birthday with my friends by hosting a party.",
      "It was": {
        "a good time",
        "and we had a lot of fun."
      }
    }
  }
}
```

Explanation of Changes:

- Changed "having two brothers" to "have two brothers" to use the correct verb form.
- Changed "I am liking" to "I enjoy" to use the correct verb form and phrasing for expressing a preference or hobby.
- Changed "playing cricket in my free time" to "reading books and playing cricket in my free time" to maintain parallel structure.
- Changed "I did a party" to "I celebrated my birthday with my friends by hosting a party" to us

In [None]:
import google.generativeai as palm

# Set your API key (replace 'YOUR_API_KEY' with the actual key)
palm.configure(api_key='AIzaSyBCW-TszvSeBUqHd2Ap7gpnjaVUG5BAlx0')

def correct_grammar(text):
    # Add a more specific prompt
    prompt = (
        '''
        You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose. You will explain language rules, guide pronunciation, and provide examples to help learners improve.
        Whenever the learner makes a mistake—whether grammatical, pronunciation-related, or structural—you will gently correct them, explain the error, and provide the correct way to say it. You will also offer helpful tips to improve their language skills, tailoring your feedback to their level of expertise. Act as a supportive tutor, fostering confidence and encouraging progress. **Do not change the meaning of the original text.**
        '''
    )

    input_text = prompt + text  # Concatenate the prompt with the input text

    # Call Google Gemini API (PaLM)
    response = palm.generate_text(
        model="models/text-bison-001",  # Added the 'models/' prefix
        prompt=input_text,
        temperature=0.7,  # Adjust the temperature to control randomness
        max_output_tokens=512,  # Adjust max tokens for larger output
        stop_sequences=["\n"]  # Optional stop sequence to end generation
    )

    # Extract the corrected text
    corrected_text = response.result
    return corrected_text

# Example text with errors
text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."

# Correct the grammar using Google Gemini API
corrected_text = correct_grammar(text)
print("Corrected text:", corrected_text)



NotFound: 404 POST https://generativelanguage.googleapis.com/v1beta/models/text-bison-001:generateText?%24alt=json%3Benum-encoding%3Dint: Requested entity was not found.

In [None]:
import google.generativeai as palm

# Set your API key (replace 'YOUR_API_KEY' with the actual key)
palm.configure(api_key='AIzaSyBCW-TszvSeBUqHd2Ap7gpnjaVUG5BAlx0')

def correct_grammar(text):
    # Add a more specific prompt
    prompt = (
        '''
        You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose. You will explain language rules, guide pronunciation, and provide examples to help learners improve.
        Whenever the learner makes a mistake—whether grammatical, pronunciation-related, or structural—you will gently correct them, explain the error, and provide the correct way to say it. You will also offer helpful tips to improve their language skills, tailoring your feedback to their level of expertise. Act as a supportive tutor, fostering confidence and encouraging progress. **Do not change the meaning of the original text.**
        '''
    )

    input_text = prompt + text  # Concatenate the prompt with the input text

    # Call Google Gemini API (PaLM)
    response = palm.generate_text(
        model="models/gemini-1.0-pro-latest",  # Changed to a supported model
        prompt=input_text,
        temperature=0.7,  # Adjust the temperature to control randomness
        max_output_tokens=512,  # Adjust max tokens for larger output
        stop_sequences=["\n"]  # Optional stop sequence to end generation
    )

    # Extract the corrected text
    corrected_text = response.result
    return corrected_text

# Example text with errors
text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."

# Correct the grammar using Google Gemini API
corrected_text = correct_grammar(text)
print("Corrected text:", corrected_text)



BadRequest: 400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro-latest:generateText?%24alt=json%3Benum-encoding%3Dint: * GenerateTextRequest.model: unexpected model name format


In [None]:
import google.generativeai as genai

class GoogleGeminiAPI:
    def __init__(self, api_key, model_name="models/gemini-1.0-pro-latest"):
        self.api_key = api_key
        self.model_name = model_name
        self.model = None

    def gcp_config(self):
        try:
            genai.configure(api_key=self.api_key)
        except Exception as e:
            print("gcp_config:: ", e)

    def list_available_models(self):
        try:
            models = genai.list_models()
            for model in models:
                print(f"Model ID: {model.modelId}, Model Name: {model.displayName}")
        except Exception as e:
            print("list_available_models:: ", e)

    def load_model(self):
        try:
            self.gcp_config()
            self.model = self.model_name
        except Exception as e:
            print("load_model:: ", e)

    def generate_text(self, prompt):
        try:
            if not self.model:
                self.load_model()

            response = genai.generate_text(
                model=self.model_name,
                prompt=prompt,
                temperature=0.7,
                max_output_tokens=512
            )
            return response.result
        except Exception as e:
            print("generate_text:: ", e)
            return None

# Example usage
if __name__ == "__main__":
    api_key = "AIzaSyBCW-TszvSeBUqHd2Ap7gpnjaVUG5BAlx0"  # Replace with your actual API key
    text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."

    # Initialize the GoogleGeminiAPI class
    gemini_api = GoogleGeminiAPI(api_key, model_name="models/gemini-1.0-pro")

    # List available models to verify that the correct model is available
    print("Available models:")
    gemini_api.list_available_models()

    # Generate corrected text using the provided input
    prompt = f"""
    You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary.
    Correct the following text and provide an explanation for the changes made: '{text}'
    """

    # Generate the corrected version of the text
    corrected_text = gemini_api.generate_text(prompt + text)

    # Output the corrected text
    if corrected_text:
        print("Corrected text:", corrected_text)


Available models:
list_available_models::  'Model' object has no attribute 'modelId'
generate_text::  400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro:generateText?%24alt=json%3Benum-encoding%3Dint: * GenerateTextRequest.model: unexpected model name format





In [None]:
import google.generativeai as genai

class GoogleGeminiAPI:
    def __init__(self, api_key, model_name="models/gemini-1.0-pro-latest"):
        self.api_key = api_key
        self.model_name = model_name
        self.model = None

    # Configure GCP with the provided API key
    def gcp_config(self):
        try:
            genai.configure(api_key=self.api_key)
        except Exception as e:
            print(f"An error occurred: {e}")

    def list_available_models(self):
        try:
            models = genai.list_models()
            for model in models:
                print(f"Model ID: {model.name}, Model Name: {model.displayName}")
        except Exception as e:
            print("list_available_models:: ", e)

    # Load the model from Google Gemini
    def load_model(self):
        try:
            self.gcp_config()
            self.model = genai.GenerativeModel(self.model_name)
        except Exception as e:
            print(f"An error occurred: {e}")

    # Method to call the API and generate content
    def generate_text(self, prompt):
        try:
            if not self.model:
                self.load_model()

            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            print("generate_text:: ", e)
            return None

# Example usage
if __name__ == "__main__":
    api_key = "AIzaSyBCW-TszvSeBUqHd2Ap7gpnjaVUG5BAlx0"  # Replace with your actual API key
    model_name = "models/gemini-1.0-pro-latest"
    prompt = (
        '''
        You are a highly knowledgeable language teacher with in-depth expertise in multiple languages, including their grammar, pronunciation, and vocabulary. Your role is to teach language learners how to speak fluently and accurately in any language they choose. You will explain language rules, guide pronunciation, and provide examples to help learners improve.
        Whenever the learner makes a mistake—whether grammatical, pronunciation-related, or structural—you will gently correct them, explain the error, and provide the correct way to say it. You will also offer helpful tips to improve their language skills, tailoring your feedback to their level of expertise. Act as a supportive tutor, fostering confidence and encouraging progress. **Do not change the meaning of the original text.make sure to give the output in json format**
        '''
    )

    text = "I am having two brothers and one sister. My elder brother is working in an IT company and my younger brother is studying in school. I am liking to read books and playing cricket in my free time. Yesterday, I did a party with my friends for celebrating my birthday. It was a good fun, and we enjoyed a lot."


    # Initialize the GoogleGeminiAPI class
    gemini_api = GoogleGeminiAPI(api_key, model_name)

    # Generate text using the provided prompt
    corrected_text = gemini_api.generate_text(prompt + text)

    # Output the corrected text
    if corrected_text:
        print("Corrected text:", corrected_text)

Corrected text: ```json
{
  "correctedText": "I have two brothers and one sister. My elder brother works in an IT company and my younger brother studies in school. I enjoy reading books and playing cricket in my free time. Yesterday, I had a party with my friends to celebrate my birthday. It was great fun, and we enjoyed ourselves a lot.",
  "corrections": [
    {
      "original": "I am having",
      "corrected": "I have",
      "errorType": "grammar",
      "explanation": "The present continuous tense is used to describe actions that are happening now or over a period of time. In this case, the action of having siblings is a permanent state, so we should use the simple present tense."
    },
    {
      "original": "is working",
      "corrected": "works",
      "errorType": "grammar",
      "explanation": "Since the subject (My elder brother) is in the third person singular, the verb should be in the third person singular form."
    },
    {
      "original": "I am liking",
      "

In [None]:
import google.generativeai as genai

def list_available_models(api_key):
    try:
        # Configure GCP API key
        genai.configure(api_key=api_key)

        # List available models
        models = genai.list_models()
        available_models = []

        # Collect the models with their IDs and descriptions
        for model in models:
            model_info = {
                "Model ID": model.model_id,  # Corrected attribute access
                "Display Name": model.display_name,  # Corrected attribute access
                "Description": model.description if hasattr(model, 'description') else 'No description available'  # Corrected for missing description
            }
            available_models.append(model_info)

        return available_models
    except Exception as e:
        return f"Error listing models: {e}"

# Example usage:
api_key = "YOUR_GCP_API_KEY"  # Replace with your actual API key
available_models = list_available_models(api_key)

# Output the available models
for model in available_models:
    print(f"Model ID: {model['Model ID']}, Display Name: {model['Display Name']}, Description: {model['Description']}")




TypeError: string indices must be integers

In [None]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-1.0.1-py3-none-any.whl.metadata (24 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading speechbrain-1.0.1-py3-none-any.whl (807 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m807.2/807.2 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.c

In [None]:
import torch
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
from speechbrain.nnet.CNN import Conv1d

class MyECAPA_TDNN(ECAPA_TDNN):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, x, lengths=None):
        """Processes the input tensor x and returns an output tensor."""
        x = x.transpose(1, 2)
        xl = []
        for layer in self.layers:
            try:
                x = layer(x)
            except TypeError:
                x = layer(x)
            xl.append(x)

        # Concatenate the layers
        x = torch.cat(xl, dim=2)
        x = x.transpose(1, 2)
        x = self.norm(x)
        return x


import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model for speaker embeddings and move it to GPU
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmp")

# Replace the original ECAPA_TDNN with your custom one
# Access n_mels from the correct object
n_mels = classifier.hparams.compute_features.n_mels
classifier.mods.embedding_model = MyECAPA_TDNN(
    input_size=n_mels, # Access n_mels directly
    lin_neurons=192,
)

classifier = classifier.to(device)  # Move model to GPU if available

# Function to extract embeddings from audio and use GPU
def extract_voice_embedding(file_path):
    # Load audio file (MP3 works if torchaudio has proper backend)
    signal, fs = torchaudio.load(file_path)

    # Move audio signal to GPU if available
    signal = signal.to(device) # Moved signal to the device

    # Compute the embeddings (on GPU)
    embeddings = classifier.encode_batch(signal) # Removed lengths argument

    # Convert tensor to numpy array and move to CPU before returning
    embedding_np = embeddings.squeeze().detach().cpu().numpy()

    return embedding_np

# Path to your MP3 audio file
file_path = "/content/Recording (3).mp3"

# Extract voice embeddings
voice_embedding = extract_voice_embedding(file_path)

# Output the voice embedding
print("Voice Embedding Shape:", voice_embedding.shape)
print("Voice Embedding:", voice_embedding)

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


AttributeError: 'Fbank' object has no attribute 'n_mels'

In [None]:
from TTS.api import TTS

# Initialize the YourTTS model for zero-shot voice cloning
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", gpu=True)

# Define the text and load a short sample of the user's voice
text = "Yesterday I got to the store to buy some vegetables and tomatoes. The cashier said I don't have enough money so I leave without buying anything."
speaker_wav = "/content/Recording (3).mp3"

# Generate speech with the user's voice
# Added the language argument
tts.tts_to_file(text=text, speaker_wav=speaker_wav, language="en", file_path="output.wav") # You can change "en" to your desired language code.



 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > External Speaker Encoder Loaded !!
 > initialization of language-embedding layers.
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10

'output.wav'

In [None]:
!pip install TTS
!pip install torchaudio

Collecting TTS
  Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl.metadata (21 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->T

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 5

In [None]:
# Instead of
# from TTS.utils.downloaders import download_model

from TTS.utils.manage import ModelManager # Import ModelManager instead of download_model

import os

# Define custom directories for saving the models
custom_model_dir = "/content/"

# Ensure the directory exists
os.makedirs(custom_model_dir, exist_ok=True)

# Initialize ModelManager
model_manager = ModelManager()

# Download and save the speaker encoder model in the custom directory
# Changed from "encoder/speaker_embedding/pretrained" to "speaker_embedding/libri_tts"
encoder_model_path, config_path = model_manager.download_model("speaker_embedding/libri_tts")

# Download and save the TTS model in the custom directory
tts_model_path, config_path = model_manager.download_model("tts_models/en/vctk/vits")


print(f"Encoder model saved at: {encoder_model_path}")
print(f"TTS model saved at: {tts_model_path}")

ValueError: not enough values to unpack (expected 4, got 2)

In [None]:
import torch
from TTS.utils.synthesizer import Synthesizer
from TTS.tts.utils.speakers import SpeakerManager
import torchaudio

# Step 1: Load the pre-trained TTS model and speaker encoder
tts_model_path = "/root/.local/share/tts/tts_models--en--vctk--vits"
encoder_model_path = "/root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC"

# Initialize the TTS synthesizer
# Removed the speaker_embedding argument. If you are using an older version of TTS, this argument may not be supported.
synthesizer = Synthesizer(tts_model_path)

# Initialize the speaker manager for extracting embeddings
speaker_manager = SpeakerManager(encoder_model_path)

# Step 2: Load a user's voice sample (e.g., short recording)
signal, sample_rate = torchaudio.load("Recording (3).mp3")

# Convert the signal to single-channel (if needed)
if signal.shape[0] > 1:
    signal = torch.mean(signal, dim=0, keepdim=True)

# Step 3: Extract the speaker embedding from the voice sample
embedding = speaker_manager.compute_embedding(signal, sample_rate)

# Save the extracted speaker embedding (optional)
torch.save(embedding, "speaker_embedding.pt")

# Step 4: Use the embedding to synthesize speech
text = "Hello, this is your custom voice!"

# Pass the speaker embedding to the tts method instead of in the constructor
wav = synthesizer.tts(text, speaker_embedding=embedding)

# Save the synthesized speech to an audio file
torchaudio.save("output_speech.wav", wav.unsqueeze(0), sample_rate)

ModuleNotFoundError: No module named 'TTS'

In [None]:
!pip install resemblyzer

Collecting resemblyzer
  Downloading Resemblyzer-0.1.4-py3-none-any.whl.metadata (5.8 kB)
Collecting webrtcvad>=2.0.10 (from resemblyzer)
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting typing (from resemblyzer)
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Resemblyzer-0.1.4-py3-none-any.whl (15.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m103.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: webrtcvad, typing
  Building wheel for webrtcvad (setup.py) ... [?25l[?25hdone
  Created wheel for webrtcvad: filename=webrtcvad-2.0.10-cp310-cp310-linu

In [None]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-1.0.1-py3-none-any.whl.metadata (24 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain)
  Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading speechbrain-1.0.1-py3-none-any.whl (807 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m807.2/807.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)
Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ruamel.yaml.

In [None]:
import torchaudio
import torch
from speechbrain.pretrained import EncoderClassifier

# Load pre-trained speaker embedding model (ECAPA-TDNN)
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir")

# Load your audio file
signal, fs = torchaudio.load("/content/Recording (3).mp3")

# Extract speaker embeddings
embeddings = classifier.encode_batch(signal)

# Save the embeddings as a .pt file
torch.save(embeddings, "speaker_embedding.pt")

print("Speaker embedding shape:", embeddings.shape)
print("Speaker embedding:", embeddings)
print("Embeddings saved to speaker_embedding.pt")


  from speechbrain.pretrained import EncoderClassifier


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)


Speaker embedding shape: torch.Size([2, 1, 192])
Speaker embedding: tensor([[[ 18.6178,  18.2741,  -6.5986,  18.3758,  30.2301,  17.0390,  37.3764,
           11.5650, -11.9823,  13.7152,   3.6827,   1.9416, -11.9796,  24.5610,
          -15.4900,   6.4112,  -4.5491,  39.2041,   1.8247,  26.3106,  33.9382,
            8.7079, -33.5390,  19.1385,  21.8791,  13.9973,  -6.3077,  37.2694,
            6.4765, -45.0363, -29.6866, -23.1685,  -6.6202,  -9.5593,  10.7772,
           17.4275,  27.8355,   7.3093,  10.6068,  58.0147,   8.7371,   2.1782,
           16.5614, -29.4978, -14.5370, -15.8538, -15.7908, -24.0465,  47.6750,
          -15.9792,  14.9432,   2.9524,   1.0995,  -6.7969,  -9.1487, -31.3943,
           -3.6731,  -2.1154, -36.5860, -25.3069,  47.4683,  -1.4619, -22.4382,
           14.2463,  13.9144, -17.6660,  14.7957,   2.0066,   4.5435, -14.4444,
           34.7605,  -7.8900,   4.0353,  -4.8533, -11.4861,   2.2811,  -2.3447,
           11.6216, -23.4162,  -6.1004, -19.7948,  2

In [None]:
!pip install --upgrade numpy pandas librosa

Collecting numpy
  Using cached numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached librosa-0.10.2.post1-py3-none-any.whl (260 kB)
Installing collected packages: pandas, librosa
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
  Attempting uninstall: librosa
    Found existing installation: libros

In [None]:
import torch
import torchaudio
from TTS.utils.synthesizer import Synthesizer
from TTS.utils.generic_utils import download_model
from speechbrain.pretrained import EncoderClassifier

# Load pre-trained speaker embedding model (ECAPA-TDNN)
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir")

# Function to get speaker embedding
def get_embedding(audio_path):
    signal, fs = torchaudio.load(audio_path)
    embedding = classifier.encode_batch(signal)
    return embedding.squeeze()  # Remove batch dimension if needed

# Load the speaker embedding from a voice sample (reference voice)
speaker_embedding = get_embedding("/content/Recording (3).mp3")

# Download the LJSpeech TTS model if not already present
tts_model_dir = "tts_models/en/ljspeech/tacotron2"
# This line downloads the model and its config
download_model(tts_model_dir)

# Initialize the TTS synthesizer with config and model checkpoint
synthesizer = Synthesizer(
    tts_checkpoint=f"{tts_model_dir}/model.pth",  # Path to model weights
    tts_config_path=f"{tts_model_dir}/config.json",  # Path to model config
)

# Text to be converted into speech
text = "Hello, this is your custom voice!"

# Generate speech using the speaker embedding
wav = synthesizer.tts(text, speaker_embedding=speaker_embedding)

# Save the output to a .wav file
torchaudio.save("output_speech.wav", wav.unsqueeze(0), 22050)  # Assuming 22.05kHz sample rate

print("Speech synthesized and saved to 'output_speech.wav'.")


ImportError: cannot import name 'download_model' from 'TTS.utils.generic_utils' (/usr/local/lib/python3.10/dist-packages/TTS/utils/generic_utils.py)

In [None]:
!pip install --force-reinstall protobuf

Collecting protobuf
  Downloading protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl (316 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m307.2/316.6 kB[0m [31m13.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.6
    Uninstalling protobuf-3.19.6:
      Successfully uninstalled protobuf-3.19.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,

In [None]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText, Wav2Vec2Processor, Wav2Vec2Model
import torchaudio

# Load the processor and model for voice embeddings
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

# Load your mp3 file and extract features
def extract_voice_embedding(mp3_file):
    waveform, sample_rate = torchaudio.load(mp3_file)
    # Resample to 16kHz
    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    waveform = resampler(waveform)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state
    return embeddings

# Example usage
mp3_file = '/content/Recording (3).mp3'
voice_embedding = extract_voice_embedding(mp3_file)
print(voice_embedding.shape)  # To see the embedding size

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [None]:
# Clone Real-Time Voice Cloning (SV2TTS) repository which integrates Tacotron 2 with voice embeddings
!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git
%cd Real-Time-Voice-Cloning

# Install dependencies
!pip install -q -r requirements.txt

# Download pretrained models (encoder, synthesizer, vocoder)
!wget https://github.com/CorentinJ/Real-Time-Voice-Cloning/releases/download/v1.0/pretrained.zip
!unzip pretrained.zip
