In [1]:
import os
from pydub import AudioSegment

# Input audio file
filename = "ai_project_audio.mp3"

# Check if the file exists
print("🔍 Checking audio file...")
if not os.path.exists(filename):
    raise FileNotFoundError(f"❌ File not found: {filename}")
else:
    print("✅ File found.")


🔍 Checking audio file...
✅ File found.


In [2]:
import whisper

# Load Whisper model
print("🎙️ Loading Whisper model...")
model = whisper.load_model("medium")

# Transcribe audio
print("📝 Transcribing...")
result = model.transcribe(filename, language="en")
transcript = result["text"]
print("✅ Transcript:", transcript)


🎙️ Loading Whisper model...
📝 Transcribing...




✅ Transcript:  Rhea had no cake on her birthday. She felt sad. Her little brother drew a cake on paper. When she saw this, she felt very happy.


In [3]:
from googletrans import Translator

# Translate transcript to Hindi
translator = Translator()
translated = translator.translate(transcript, src='en', dest='hi')
translated_text = translated.text
print("🈶 Translated Text (Hindi):", translated_text)


🈶 Translated Text (Hindi): रिया के जन्मदिन पर कोई केक नहीं था।वह दुखी महसूस कर रही थी।उसके छोटे भाई ने कागज पर एक केक खींचा।जब उसने यह देखा, तो उसे बहुत खुशी हुई।


In [4]:
# Extract a short 10-second voice sample for cloning
clip = AudioSegment.from_mp3(filename)[:5000]  
ref_clip_path = "reference.wav"
clip.export(ref_clip_path, format="wav")
print(f"✅ Saved reference audio: {ref_clip_path}")


✅ Saved reference audio: reference.wav


In [5]:
import librosa
import numpy as np

# Load audio
y, sr = librosa.load(filename)

# Compute pitch (YIN) and energy (RMS)
pitch = librosa.yin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
energy = np.mean(librosa.feature.rms(y=y))

# Store emotion embedding
avg_pitch = float(np.mean(pitch))
pitch_var = float(np.var(pitch))
energy_val = float(energy)

emotion_embedding = {
    "avg_pitch": avg_pitch,
    "pitch_variance": pitch_var,
    "energy": energy_val
}
print("🎭 Emotion features:", emotion_embedding)


🎭 Emotion features: {'avg_pitch': 468.3760032860025, 'pitch_variance': 302111.39765560115, 'energy': 0.09959428012371063}


In [6]:
from torch.serialization import add_safe_globals

# XTTS-related configs and models
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig

# Add all necessary safe globals for Coqui XTTS
add_safe_globals([
    XttsConfig,
    XttsAudioConfig,
    XttsArgs,
    BaseDatasetConfig,
])




from TTS.api import TTS

# Load Coqui XTTS model
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True, gpu=False)

# Generate initial Hindi speech
intermediate_audio = "output_xtts_raw.wav"
tts.tts_to_file(
    text=translated_text,
    speaker_wav=ref_clip_path,
    language="hi",
    file_path=intermediate_audio
)
print("✅ Raw XTTS output saved:", intermediate_audio)



  from .autonotebook import tqdm as notebook_tqdm


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


 > Text splitted to sentences.
['रिया के जन्मदिन पर कोई केक नहीं था।वह दुखी महसूस कर रही थी।उसके छोटे भाई ने कागज पर एक केक खींचा।जब उसने यह देखा, तो उसे बहुत खुशी हुई।']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 39.406291484832764
 > Real-time factor: 3.0826358320108502
✅ Raw XTTS output saved: output_xtts_raw.wav


In [22]:
import numpy as np
import librosa
import soundfile as sf
import pyrubberband as pyrb
from scipy.ndimage import gaussian_filter1d

# Load audio
y_out, sr_out = librosa.load("output_xtts_raw.wav", sr=None)
y_out = y_out.astype(np.float32)

# === Pitch shift (very subtle) ===
baseline_pitch = 150
semitone_shift = np.log2(avg_pitch / baseline_pitch) * 12
semitone_shift = float(np.clip(semitone_shift, -0.5, 0.5))  # subtle and capped
print(f"🔧 Pitch shift (semitones): {semitone_shift:.2f}")

y_shifted = pyrb.pitch_shift(y_out, sr_out, n_steps=semitone_shift)

# === Smooth energy envelope scaling ===
frame_length = 2048
hop_length = 512

# Extract frame-wise RMS energy and smooth it
rms = librosa.feature.rms(y=y_shifted, frame_length=frame_length, hop_length=hop_length)[0]
rms_smooth = gaussian_filter1d(rms, sigma=3)

# Target energy factor (relative to neutral reference)
target_rms = energy_val  # from emotion embedding
ref_rms = 0.05
scaling_curve = np.clip(np.sqrt(target_rms / ref_rms) * (rms_smooth / np.mean(rms_smooth)), 0.5, 2.0)

# Map frame-wise scaling back to full signal length
frame_centers = librosa.frames_to_samples(np.arange(len(scaling_curve)), hop_length=hop_length)
scaling_envelope = np.interp(np.arange(len(y_shifted)), frame_centers, scaling_curve)

# Apply the smoothed gain envelope
y_final = y_shifted * scaling_envelope
y_final = np.clip(y_final, -1.0, 1.0)  # avoid clipping

# Save output
final_output = "output_xtts_emotion.wav"
sf.write(final_output, y_final, sr_out)
print("✅ Final audio with smooth emotion modulation saved as:", final_output)


🔧 Pitch shift (semitones): 0.50
✅ Final audio with smooth emotion modulation saved as: output_xtts_emotion.wav
