In [None]:
# Strip every cell output from Speech_2.ipynb and write the result back to the
# same file (--inplace), keeping the notebook format (--to notebook).
!jupyter nbconvert --to notebook --inplace --ClearOutputPreprocessor.enabled=True Speech_2.ipynb


In [None]:
# Make Google Drive available to this Colab runtime; the audio file read later
# in the notebook lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
!pip install torch torchaudio librosa transformers matplotlib soundfile


In [None]:
import torch
import torchaudio
import librosa
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# --- Load the utterance and show its waveform ---
# The clip is read from the mounted Drive, downmixed to mono and resampled to
# 16 kHz by librosa.
audio_path = "/content/drive/MyDrive/Speech-2/LJ050-0274.wav"
signal, sr = librosa.load(audio_path, sr=16000, mono=True)

duration_sec = len(signal) / sr
print(f"Sample Rate: {sr}")
print(f"Duration (sec): {duration_sec}")

# Time stamp (in seconds) of every sample, used as the x-axis below.
time = np.arange(signal.shape[0]) / sr

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(time, signal)
ax.set_title("Speech Waveform (LJ Speech)")
ax.set_xlabel("Time (seconds)")
ax.set_ylabel("Amplitude")
plt.show()

# --- Run the pretrained Wav2Vec2 CTC model on the waveform ---
# Processor and model are loaded from the same checkpoint.
CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(CHECKPOINT)

# Wrap the raw samples as a padded batch of PyTorch tensors.
inputs = processor(signal, sampling_rate=sr, return_tensors="pt", padding=True)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    model_output = model(inputs.input_values)
logits = model_output.logits

# Greedy decoding: take the best-scoring vocabulary id at each position and
# let the processor turn the id sequence into a string.
# NOTE(review): the variable is called `phonemes`, but this checkpoint appears
# to be a character-level model, so the decoded string is likely plain text
# rather than phonemes -- confirm.
predicted_ids = logits.argmax(dim=-1)
phonemes = processor.batch_decode(predicted_ids)

print("Recognized Phonemes / Tokens:", phonemes[0], sep="\n")

# --- Locate the emitted tokens in time and cut one of them out of the audio ---
# `predicted_ids` is the argmax over the vocabulary axis of `logits`, so it
# holds one id per model output *frame*, not one per recognised token. Most
# frames of a CTC model carry the blank symbol, so indexing frames directly
# (frame 10 == "token 10") usually extracts silence/blank. Instead, runs of
# identical frame ids are collapsed into token segments first.
frame_ids = predicted_ids[0]
num_frames = frame_ids.shape[0]
total_duration = len(signal) / sr

# Approximate duration covered by one output frame (frames are assumed to be
# spread uniformly over the clip).
time_per_frame = total_duration / num_frames
print("Approx time per CTC frame (sec):", time_per_frame)

# Collapse consecutive duplicates: run_ids[i] is emitted for frames
# run_starts[i] .. run_ends[i] - 1.
run_ids, run_lengths = torch.unique_consecutive(frame_ids, return_counts=True)
run_ends = torch.cumsum(run_lengths, dim=0)
run_starts = run_ends - run_lengths

# Wav2Vec2's CTC head uses the tokenizer's pad token as the blank symbol; the
# word delimiter marks spaces between words. Neither is an audible token.
tokenizer = processor.tokenizer
skip_ids = {tokenizer.pad_token_id, tokenizer.word_delimiter_token_id}
token_segments = [
    (tokenizer.convert_ids_to_tokens(token_id), first_frame, end_frame)
    for token_id, first_frame, end_frame in zip(
        run_ids.tolist(), run_starts.tolist(), run_ends.tolist()
    )
    if token_id not in skip_ids
]
num_tokens = len(token_segments)

# Example: extract the emitted token at index 10
phoneme_index = 10
if not 0 <= phoneme_index < num_tokens:
    raise IndexError(
        f"phoneme_index {phoneme_index} is out of range: "
        f"only {num_tokens} tokens were emitted for this clip"
    )

token_label, start_frame, end_frame = token_segments[phoneme_index]

# NOTE: argmax CTC emissions tend to be short spikes, so this span is only a
# rough localisation of the token; a forced aligner is needed for true
# boundaries.
start_time = start_frame * time_per_frame
end_time = end_frame * time_per_frame

start_sample = int(start_time * sr)
end_sample = int(end_time * sr)

phoneme_signal = signal[start_sample:end_sample]

plt.figure(figsize=(6,3))
plt.plot(phoneme_signal)
plt.title(f"Extracted Token {token_label!r} (Index {phoneme_index})")
plt.xlabel("Samples")
plt.ylabel("Amplitude")
plt.show()

# Save the excerpt at the same sample rate as the loaded signal.
sf.write("extracted_phoneme.wav", phoneme_signal, sr)

