In [3]:
import whisper
import os

# Path of the audio clip to process; later cells reuse `filename`.
filename = "ai_project_audio.mp3"

# Step 1: Check if file exists
print("🔍 Checking audio file...")
if not os.path.exists(filename):
    # Raise instead of calling exit(): inside a notebook, exit() asks the
    # kernel to shut down (losing all state) rather than just stopping this
    # cell. An exception halts this cell and "Run All" while keeping the
    # kernel alive, and still shows the same message.
    raise FileNotFoundError(f"❌ File not found: {filename}")

# Step 2: Load Whisper model
print("🎙️ Loading Whisper model...")
model = whisper.load_model("medium")

# Step 3: Transcribe with forced English
# language="en" skips Whisper's language auto-detection.
print("📝 Transcribing audio...")
result = model.transcribe(filename, language="en")

# Step 4: Output the result
# `result` is a dict with 'text', 'segments' and 'language' keys; the full
# dump below is verbose (it includes per-segment token ids).
print("✅ Full Result:", result)
print("📜 Transcript:", result["text"])

🔍 Checking audio file...
🎙️ Loading Whisper model...
📝 Transcribing audio...




✅ Full Result: {'text': ' Rhea had no cake on her birthday. She felt sad. Her little brother drew a cake on paper. When she saw this, she felt very happy.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 6.8, 'text': ' Rhea had no cake on her birthday. She felt sad. Her little brother drew a cake on paper.', 'tokens': [50364, 497, 27799, 632, 572, 5908, 322, 720, 6154, 13, 1240, 2762, 4227, 13, 3204, 707, 3708, 12804, 257, 5908, 322, 3035, 13, 50704], 'temperature': 0.0, 'avg_logprob': -0.20141316748954155, 'compression_ratio': 1.2190476190476192, 'no_speech_prob': 0.1239473819732666}, {'id': 1, 'seek': 0, 'start': 6.8, 'end': 9.68, 'text': ' When she saw this, she felt very happy.', 'tokens': [50704, 1133, 750, 1866, 341, 11, 750, 2762, 588, 2055, 13, 50848], 'temperature': 0.0, 'avg_logprob': -0.20141316748954155, 'compression_ratio': 1.2190476190476192, 'no_speech_prob': 0.1239473819732666}], 'language': 'en'}
📜 Transcript:  Rhea had no cake on her birthday. She felt sad. He

In [4]:
from googletrans import Translator

# Translate the Whisper transcript from English to Hindi with googletrans.
# Depends on `result` produced by the transcription cell above.
transcript = result["text"]
print("\n🌐 Translating to Hindi...")
translator = Translator()
# src/dest are language codes: English in, Hindi out.
# NOTE(review): this call is used synchronously here; some googletrans
# releases expose translate() as a coroutine — confirm the installed version.
translated = translator.translate(transcript, src='en', dest='hi')
# Keep only the translated string; the TTS cell reads `translated_text`.
translated_text = translated.text
print("🈶 Translated Text:", translated_text)



🌐 Translating to Hindi...
🈶 Translated Text: रिया के जन्मदिन पर कोई केक नहीं था।वह दुखी महसूस कर रही थी।उसके छोटे भाई ने कागज पर एक केक खींचा।जब उसने यह देखा, तो उसे बहुत खुशी हुई।


In [5]:
import librosa
from resemblyzer import VoiceEncoder, preprocess_wav

# Load the same audio file with librosa; sr=None keeps the file's native
# sampling rate, which is returned as `sr`.
wav_array, sr = librosa.load(filename, sr=None)
# Pass the native rate along so resemblyzer knows what it was given.
# NOTE(review): presumably preprocess_wav resamples/normalizes to the
# encoder's expected format — confirm against the resemblyzer docs.
wav = preprocess_wav(wav_array, source_sr=sr)

# Embed the whole utterance into a single speaker vector.
encoder = VoiceEncoder()
speaker_embedding = encoder.embed_utterance(wav)
# The recorded run printed shape (256,).
print("Speaker embedding shape:", speaker_embedding.shape)


Loaded the voice encoder model on cpu in 0.03 seconds.
Speaker embedding shape: (256,)


In [6]:
import librosa
import numpy as np

# Rough prosody features (pitch statistics + loudness) used as a stand-in
# "emotion embedding" for the clip.
y, sr = librosa.load(filename)

# Frame-wise fundamental-frequency estimate over the C2..C7 range.
# `sr` is passed explicitly so the estimate stays tied to the rate the audio
# was actually loaded at, instead of relying on yin's default matching
# librosa.load's default.
# NOTE(review): yin yields a value for every frame with no voiced/unvoiced
# decision, so silent or noisy frames likely inflate the mean/variance below —
# consider librosa.pyin with its voiced flag if these numbers matter.
pitch = librosa.yin(
    y,
    fmin=librosa.note_to_hz('C2'),
    fmax=librosa.note_to_hz('C7'),
    sr=sr,
)

# Mean RMS energy across all frames, as a single loudness figure.
energy = np.mean(librosa.feature.rms(y=y))

# Plain-float dict so the values print/serialize cleanly.
emotion_embedding = {
    "avg_pitch": float(np.mean(pitch)),
    "pitch_variance": float(np.var(pitch)),
    "energy": float(energy)
}
print("Step 4 - Emotion embedding (approx):", emotion_embedding)


Step 4 - Emotion embedding (approx): {'avg_pitch': 408.75398495630577, 'pitch_variance': 229220.76254593098, 'energy': 0.09959428012371063}


In [7]:
from gtts import gTTS
from pydub import AudioSegment

# Synthesize speech for the translated transcript.
# `translated_text` is Hindi (the translation cell uses dest='hi'), so the TTS
# language must be "hi" as well; with lang="en" the Hindi text would be read
# by the English voice.
tts = gTTS(translated_text, lang="hi")  # target language
tts.save("output_tts.mp3")

# Convert to .wav if needed
AudioSegment.from_mp3("output_tts.mp3").export("output_tts.wav", format="wav")
print("Step 5 - TTS done.")


Step 5 - TTS done.
