In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
dataset_path = '/content/drive/MyDrive/Emotions'


In [None]:
import os
import pandas as pd
from tqdm import tqdm
import csv  # Using Python's built-in CSV writer as fallback

# Config - adjust these to match your exact dataset structure
DATASET_PATH = "/content/drive/MyDrive/Emotions"
EMOTIONS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Suprised"]
COMMON_TEXT = "kids are talking by the door"
OUTPUT_CSV = "metadata.csv"

def generate_metadata():
    """Safe metadata generation with comprehensive error handling"""
    rows = []
    for emotion in tqdm(EMOTIONS, desc="Processing emotions"):
        emotion_dir = os.path.join(DATASET_PATH, emotion)

        if not os.path.exists(emotion_dir):
            print(f"\n⚠️ Missing folder: {emotion}")
            continue

        try:
            files = [f for f in os.listdir(emotion_dir)
                   if f.endswith('.wav') and os.path.isfile(os.path.join(emotion_dir, f))]
        except Exception as e:
            print(f"\n❌ Error reading {emotion}: {str(e)}")
            continue

        for file in files:
            rows.append({
                'audio_path': os.path.join(emotion_dir, file),
                'text': COMMON_TEXT,
                'emotion': emotion.lower()
            })
    return rows

print("🚀 Generating metadata...")
metadata = generate_metadata()

if metadata:
    print("\n💾 Attempting to save metadata...")

    # Method 1: Try pandas first
    try:
        df = pd.DataFrame(metadata)
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"✅ Successfully saved {len(df)} entries using pandas")
    except Exception as e:
        print(f"⚠️ Pandas save failed: {str(e)}\nTrying alternative method...")

        # Method 2: Pure Python CSV writer
        try:
            with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['audio_path', 'text', 'emotion'])
                writer.writeheader()
                writer.writerows(metadata)
            print(f"✅ Saved {len(metadata)} entries using CSV module")
        except Exception as e:
            print(f"❌ All save methods failed: {str(e)}")
else:
    print("\n❌ No files processed. Please verify:")
    print(f"- Dataset exists: {os.path.exists(DATASET_PATH)}")
    print(f"- Subfolders: {os.listdir(DATASET_PATH)}")

# Verify output
if os.path.exists(OUTPUT_CSV):
    print("\n🔍 First 3 lines of generated CSV:")
    !head -n 3 {OUTPUT_CSV}

🚀 Generating metadata...


Processing emotions: 100%|██████████| 7/7 [00:03<00:00,  1.80it/s]



💾 Attempting to save metadata...
✅ Successfully saved 12808 entries using pandas

🔍 First 3 lines of generated CSV:
audio_path,text,emotion
/content/drive/MyDrive/Emotions/Angry/1053_TIE_ANG_XX.wav,kids are talking by the door,angry
/content/drive/MyDrive/Emotions/Angry/1051_IWL_ANG_XX.wav,kids are talking by the door,angry


In [None]:
import pandas as pd
df = pd.read_csv("metadata.csv")
print(f"Total files: {len(df)}")
print("Emotion distribution:")
print(df["emotion"].value_counts())

Total files: 12808
Emotion distribution:
emotion
angry        2177
sad          2167
happy        2167
fearful      2047
disgusted    1863
neutral      1795
suprised      592
Name: count, dtype: int64


In [None]:
# 1. Install required packages
!pip install -q transformers==4.33.3 torchaudio==2.0.2 soundfile numpy

# 2. Import libraries
from transformers import VitsModel, AutoTokenizer
import torch
import numpy as np
from IPython.display import Audio
import io
import soundfile as sf

# 3. Load pretrained model
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

# 4. Fixed text-to-speech function
def text_to_speech(text, emotion="neutral"):
    # Convert text to tokens
    inputs = tokenizer(text, return_tensors="pt")

    # Generate speech (no emotion params in this model)
    with torch.no_grad():
        output = model(**inputs)

    # Convert to numpy array and normalize
    audio = output.waveform[0].numpy()
    audio = audio / np.max(np.abs(audio))  # Normalize

    # Create in-memory WAV file
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, samplerate=model.config.sampling_rate, format='WAV')
        wav_buffer.seek(0)
        return Audio(wav_buffer.read())

# 5. Test it
text = "i want to die"
text_to_speech(text)  # Plays audio directly in notebook

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m128.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m105.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/145M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# Load a pre-trained emotion classifier
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

def predict_emotion(text):
    # Predict emotion based on the text input
    result = emotion_classifier(text)
    emotion = result[0]['label']  # Get the predicted emotion label
    return emotion

# Example usage
text = "I am so afraid today!"
predicted_emotion = predict_emotion(text)
print(f"Predicted Emotion: {predicted_emotion}")


Predicted Emotion: fear


In [None]:
import torch
from transformers import VitsModel, AutoTokenizer
import numpy as np
from IPython.display import Audio
import io
import soundfile as sf

# Load pretrained TTS model and tokenizer
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

# Emotion mapping (you can expand this as per your dataset's emotions)
emotion_map = {
    'joy': 'Happy',
    'anger': 'Angry',
    'sadness': 'Sad',
    'fear': 'Fearful',
    'surprise': 'Surprised',
    'neutral': 'Neutral',
    'disgust': 'Disgusted'
}

# 3. Text-to-Speech with Emotion (using predicted emotion)
def text_to_speech_with_emotion(text):
    # Predict emotion from the text
    predicted_emotion = predict_emotion(text)

    # Append the emotion label to the text (this might guide the model to choose the appropriate tone)
    text_with_emotion = f"{predicted_emotion}: {text}"

    # Convert text to tokens
    inputs = tokenizer(text_with_emotion, return_tensors="pt")

    # Generate speech (no emotion parameters in this model)
    with torch.no_grad():
        output = model(**inputs)

    # Convert to numpy array and normalize
    audio = output.waveform[0].numpy()
    audio = audio / np.max(np.abs(audio))  # Normalize

    # Create in-memory WAV file
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, samplerate=model.config.sampling_rate, format='WAV')
        wav_buffer.seek(0)
        return Audio(wav_buffer.read())

# Test it
text = "I am feeling so scared today!"
audio = text_to_speech_with_emotion(text)
audio


In [None]:
import os
import librosa
import torch
import concurrent.futures
import numpy as np

# Function to extract mel-spectrogram from an audio file
def extract_mel_spectrogram(audio_path, sample_rate=22050):
    y, sr = librosa.load(audio_path, sr=sample_rate)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
    return mel_spec

# Function to process each audio file
def process_audio_file(audio_path, emotion_map):
    emotion_label = get_emotion_label_from_path(audio_path, emotion_map)
    mel_spec = extract_mel_spectrogram(audio_path)
    return mel_spec, emotion_label

# Function to extract emotion label from path
def get_emotion_label_from_path(audio_path, emotion_map):
    folder_name = audio_path.split("/")[-2]  # Get the folder name as the emotion
    return emotion_map.get(folder_name, "neutral")

# Emotion mapping
emotion_map = {
    'Happy': 'Happy',
    'Angry': 'Angry',
    'Sad': 'Sad',
    'Fearful': 'Fearful',
    'Suprised': 'Suprised',
    'Neutral': 'Neutral',
    'Disgusted': 'Disgusted'
}

# Dataset path
dataset_path = "/content/drive/MyDrive/Emotions"
dataset = []

# Using concurrent.futures for parallel processing
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_audio = {}

    for emotion in emotion_map.keys():
        emotion_folder = os.path.join(dataset_path, emotion)
        for audio_file in os.listdir(emotion_folder):
            if audio_file.endswith(".wav"):
                audio_path = os.path.join(emotion_folder, audio_file)
                future = executor.submit(process_audio_file, audio_path, emotion_map)
                future_to_audio[future] = audio_path

    for future in concurrent.futures.as_completed(future_to_audio):
        mel_spec, emotion_label = future.result()
        dataset.append((mel_spec, emotion_label))

# Convert dataset to torch tensors for training
mel_specs = [torch.tensor(mel) for mel, _ in dataset]
emotion_labels = [label for _, label in dataset]

# You can now continue training your model with the processed dataset


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(emotion_labels)

# Now, encoded_labels are integers


In [None]:
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    def __init__(self, mel_specs, labels):
        self.mel_specs = mel_specs
        self.labels = labels

    def __len__(self):
        return len(self.mel_specs)

    def __getitem__(self, idx):
        return self.mel_specs[idx], torch.tensor(self.labels[idx])

# Create Dataset and DataLoader
dataset = EmotionDataset(mel_specs, encoded_labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class EmotionCNN(nn.Module):
    def __init__(self, num_classes):
        super(EmotionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 20 * 20, 128)  # adjust size based on your input
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = EmotionCNN(num_classes=len(label_encoder.classes_))


In [None]:
class EmotionDataset(Dataset):
    def __init__(self, mel_specs, labels, fixed_length=100):
        self.mel_specs = mel_specs
        self.labels = labels
        self.fixed_length = fixed_length

    def __len__(self):
        return len(self.mel_specs)

    def __getitem__(self, idx):
        mel = self.mel_specs[idx]
        label = self.labels[idx]

        # Pad or truncate to fixed length
        if mel.shape[1] < self.fixed_length:
            pad_width = self.fixed_length - mel.shape[1]
            mel = torch.nn.functional.pad(mel, (0, pad_width))
        else:
            mel = mel[:, :self.fixed_length]

        return mel, torch.tensor(label)


In [None]:
dataset = EmotionDataset(mel_specs, encoded_labels, fixed_length=100)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
import torch.nn as nn

class EmotionClassifier(nn.Module):
    def __init__(self, num_emotions):
        super(EmotionClassifier, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2)),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2,2))
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 20 * 25, 128),
            nn.ReLU(),
            nn.Linear(128, num_emotions)
        )

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.conv(x)
        x = self.fc(x)
        return x


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode emotion labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(emotion_labels)

# Now you have encoded_labels = [0, 1, 2, ..., 6]


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EmotionClassifier(num_emotions=len(label_encoder.classes_)).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
for epoch in range(10):  # you can change number of epochs
    model.train()
    for mel, label in dataloader:
        mel = mel.to(device).float()
        label = label.to(device)

        optimizer.zero_grad()
        output = model(mel)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed")

# Save your model
torch.save(model.state_dict(), "emotion_classifier.pth")


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed
Epoch 10 completed


In [None]:
from transformers import pipeline

# Use HuggingFace emotion classifier
emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=1)

def predict_emotion_from_text(text):
    result = emotion_pipeline(text)
    emotion = result['label'].lower()  # No [0]
    return emotion


In [None]:
import torch

class EmotionClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(EmotionClassifier, self).__init__()
        # Define the layers to match the saved model architecture
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),  # Adjust as per saved model
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),  # Additional convolution layers
            torch.nn.ReLU(),
            torch.nn.AdaptiveAvgPool2d((1, 1)),
            torch.nn.Flatten()
        )
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(64, 128),  # Adjust the input size based on the conv layers
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.fc(x)
        return x

# Load model with strict=False to ignore missing or unexpected keys
checkpoint = torch.load("emotion_classifier.pth", map_location=device)

model = EmotionClassifier(num_classes=7)  # 7 emotions
model.load_state_dict(checkpoint, strict=False)

model.to(device)
model.eval()


EmotionClassifier(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): AdaptiveAvgPool2d(output_size=(1, 1))
    (5): Flatten(start_dim=1, end_dim=-1)
  )
  (fc): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=7, bias=True)
  )
)

In [None]:
from transformers import pipeline

# Load text emotion model (like 'j-hartmann/emotion-english-distilroberta-base')
emotion_pipeline = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=False)

def predict_emotion_from_text(text):
    result = emotion_pipeline(text)
    predicted_emotion = result[0]['label'].lower()  # Example output: "joy", "anger", etc.
    return predicted_emotion


In [None]:
from transformers import VitsModel, AutoTokenizer
import numpy as np
import io
import soundfile as sf
from IPython.display import Audio

# Load TTS model
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

def generate_emotional_speech(text):
    # Predict the emotion
    emotion = predict_emotion_from_text(text)

    # Inject emotion into prompt
    text_with_emotion = f"{emotion}: {text}"

    # Tokenize and synthesize
    inputs = tts_tokenizer(text_with_emotion, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs)

    # Prepare audio
    audio = output.waveform[0].cpu().numpy()
    audio = audio / np.max(np.abs(audio))  # Normalize
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, samplerate=tts_model.config.sampling_rate, format='WAV')
        wav_buffer.seek(0)
        return Audio(wav_buffer.read())


In [None]:
speech = generate_emotional_speech("I am feeling very scared today!")
speech  # It will generate audio 🎤


In [None]:
pip install pyttsx3


Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Downloading pyttsx3-2.98-py3-none-any.whl (34 kB)
Installing collected packages: pyttsx3
Successfully installed pyttsx3-2.98


In [None]:
!apt-get update
!apt-get install espeak


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [1 InRelease 0 B/3,632 B 0%] [Connected                                                                                                     Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [75.2 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubun

In [None]:
import pyttsx3

engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Speed of speech
engine.setProperty('volume', 1)  # Volume (0.0 to 1.0)
engine.setProperty('voice', 'english+f4')  # You can change the voice here (male/female)

# Generate speech with a basic emotion setting
engine.say("I am feeling very scared today!")
engine.runAndWait()


In [None]:
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[1].id)  # Change index to select different voice (0 for male, 1 for female)


In [None]:
engine.say("I am feeling... very scared today!")  # Adding pause between "I am feeling" and "very scared today!"


In [None]:
!pip install gTTS


Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4


In [None]:
from gtts import gTTS
import os

# Your sentence with emotional content
text = "I am feeling very scared today!"

# Initialize Google TTS
tts = gTTS(text=text, lang='en', slow=False)

# Save the audio file
tts.save("scared_speech.mp3")

# Play the generated speech in Colab
from IPython.display import Audio
Audio("scared_speech.mp3")


In [None]:
!pip install transformers
!pip install TTS
!pip install torch torchaudio


Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting torch>=2.1 (from TTS)
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.54.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]=



In [None]:
from TTS.api import TTS

# Correct model path to an expressive TTS model (assuming a valid path exists)
tts = TTS(model_name="tts_models/en/emotion/fastspeech2", gpu=True)

# Available speakers list
available_speakers = tts.speakers
print("Available speakers:", available_speakers)

# Example speech generation with emotion control
def generate_expressive_speech(prompt, output_path="output.wav"):
    # Add emotion tags (you can detect emotion dynamically using an emotion classifier)
    emotion_tag = "happy"  # For example, using a detected emotion

    # Generate speech with emotion control
    tts.tts_to_file(
        text=prompt,
        file_path=output_path,
        speaker=available_speakers[0],  # Choose the speaker
        language="en",                 # Language (English here)
        emotion=emotion_tag            # Emotion tag
    )
    print(f"✅ Emotional Speech saved to {output_path}")

# Example usage
prompt = "I'm so excited to see you!"
generate_expressive_speech(prompt)


KeyError: 'emotion'

In [None]:
from TTS.utils.manage import ModelManager

# Initialize ModelManager
model_manager = ModelManager()

# List all available models in the TTS repository
available_models = model_manager.get_available_models()

# Print out the available models
for model in available_models:
    print(model)


AttributeError: 'ModelManager' object has no attribute 'get_available_models'

In [None]:
from TTS.api import TTS

# Example: Loading an emotion-controlled model (ensure this model exists in your setup)
tts = TTS(model_name="tts_models/en/emotion/fastspeech2", gpu=True)

# Check available speakers
available_speakers = tts.speakers
if available_speakers is None or len(available_speakers) == 0:
    print("No available speakers found.")
else:
    print("Available speakers:", available_speakers)

    # Function to generate expressive speech
    def generate_emotional_speech(prompt, emotion="happy", output_path="output.wav"):
        tts.tts_to_file(
            text=prompt,
            file_path=output_path,
            speaker=available_speakers[0],  # Pick the first available speaker
            language="en",                  # Language
            emotion=emotion                 # Set emotion tag (e.g., "happy", "sad")
        )
        print(f"✅ Speech saved to {output_path}")

    # Example usage
    prompt = "I'm so excited to see you!"
    generate_emotional_speech(prompt, emotion="happy")


KeyError: 'emotion'

In [None]:
pip install transformers TTS torch



In [None]:
from transformers import pipeline
from TTS.api import TTS
import torch


In [None]:
# Emotion classifier using Hugging Face transformers
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=False)


In [None]:
# Load a TTS model (replace this with the actual model you want to use)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=True)


 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P

In [None]:
# Function to detect emotion from the prompt
def detect_emotion(prompt):
    emotion = emotion_classifier(prompt)[0]['label'].lower()  # 'label' contains the predicted emotion
    return emotion


In [None]:
# Map detected emotions to speech parameters
def get_speech_parameters(emotion):
    if emotion == "happy":
        return {"speed": 1.2, "pitch": 1.2}  # Faster speed, higher pitch
    elif emotion == "sad":
        return {"speed": 0.8, "pitch": 0.8}  # Slower speed, lower pitch
    elif emotion == "anger":
        return {"speed": 1.3, "pitch": 1.0}  # Faster speed, normal pitch
    else:
        return {"speed": 1.0, "pitch": 1.0}  # Normal speed, normal pitch


In [None]:
# Function to generate expressive speech based on detected emotion
def generate_expressive_speech(prompt, output_path="output.wav"):
    # Step 1: Detect emotion from the prompt
    emotion = detect_emotion(prompt)
    print(f"Detected Emotion: {emotion}")

    # Step 2: Get corresponding speech parameters for the detected emotion
    params = get_speech_parameters(emotion)
    speed, pitch = params["speed"], params["pitch"]

    # Step 3: Generate speech with TTS model
    tts.tts_to_file(
        text=prompt,
        file_path=output_path,
        speaker=tts.speakers[0],  # Pick the first available speaker
        language="en",  # Language
        speed=speed,  # Emotion-controlled speed
        pitch=pitch  # Emotion-controlled pitch
    )
    print(f"✅ Speech saved to {output_path}")


**it's working till now**

In [None]:
from TTS.api import TTS
from transformers import pipeline

# 1. Load emotion classifier
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=False)

# 2. Load the VITS model
try:
    tts = TTS(model_name="tts_models/en/ljspeech/vits", gpu=True)
except Exception as e:
    print(f"Error loading model: {e}")

# 3. Emotion detection function
def detect_emotion(prompt):
    emotion = emotion_classifier(prompt)[0]['label'].lower()  # Emotion extraction
    return emotion

# 4. Adjust speech parameters based on emotion
def get_speech_parameters(emotion):
    if emotion == "joy":
        return {"speed": 1.5, "pitch": 1.4}  # even more excited
    elif emotion == "sadness":
        return {"speed": 0.2, "pitch": 0.4}  # really slow and deep
    elif emotion == "anger":
        return {"speed": 1.7, "pitch": 1.3}  # sharp and fast
    else:
        return {"speed": 1.0, "pitch": 1.0}
  # Neutral parameters

# 5. Generate speech with expressive emotion
def generate_expressive_speech(prompt, output_path="output.wav"):
    # Detect emotion
    emotion = detect_emotion(prompt)
    print(f"Detected Emotion: {emotion}")

    # Get emotion-based parameters
    params = get_speech_parameters(emotion)
    speed, pitch = params["speed"], params["pitch"]

    # Generate the expressive speech
    tts.tts_to_file(
        text=prompt,
        file_path=output_path,
        speed=speed,  # Adjust speed based on emotion
        pitch=pitch  # Adjust pitch based on emotion
    )
    print(f"✅ Speech saved to {output_path}")

# Example usage
prompt = "I'm SOOO angry and i can't do this shit anymore!!!"  # You can modify this to test other emotions
generate_expressive_speech(prompt)




 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
Detected Emotion: anger
 > Text splitted to sentences.
["I'm SOOO angry and i can't do this shit anymore!!!"]
 > Processing time: 0.2000410556793213
 > Real-time factor: 0.04950288