In [1]:
import torch
import torchaudio
import torchaudio.transforms as T
import matplotlib.pyplot as plt
from IPython.display import Audio
from pathlib import Path
import soundfile as sf
import numpy as np

In [2]:
%matplotlib inline

In [3]:
import matplotlib
# Print the installed matplotlib version so the environment is recorded in the notebook.
print(matplotlib.__version__)


3.7.1


In [4]:
# Audio is read with soundfile's sf.read further down in this notebook, so
# selecting a torchaudio I/O backend is optional here. The call is guarded
# because set_audio_backend is deprecated in recent torchaudio releases and may
# be absent altogether — NOTE(review): confirm against the installed version.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")


  torchaudio.set_audio_backend("soundfile")


In [5]:
# Absolute, machine-specific path to one recording of the Yemba dataset.
audio_path = r"C:\Users\Christian\Desktop\YembaTones\YembaTones An Annotated Dataset for Tonal and Syllabic Analysis of the Yemba Language\Yemba_Dataset\audios\speaker_1\group_1\spkr_1_group_1_statement_1.wav"

# Load the audio file (sf.read returns the samples as a NumPy array, plus the sample rate)
waveform, sample_rate = sf.read(audio_path)

# Report the shape of the loaded array and its sampling rate.
print(f"📏 Forme : {waveform.shape}")
print(f"🎧 Fréquence d’échantillonnage : {sample_rate} Hz")


📏 Forme : (29076, 2)
🎧 Fréquence d’échantillonnage : 44100 Hz


In [6]:
# Convert the NumPy samples returned by sf.read into a float32 tensor laid out
# as [channel, time]. The isinstance guard makes re-running this cell a no-op
# once `waveform` is already a tensor.
if isinstance(waveform, np.ndarray):
    if waveform.ndim == 1:
        # Mono files come back as a flat (frames,) array; add an explicit
        # channel column so the transpose below still yields [channel, time].
        waveform = waveform[:, np.newaxis]
    waveform = torch.tensor(waveform.T, dtype=torch.float32)  # [channel, time]


In [7]:
# Target sampling rate (example value).
target_sr = 16000

# Resample only when the file is not already at the target rate, then update
# `sample_rate` so the following cells see the new rate.
if sample_rate != target_sr:
    resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr)
    waveform, sample_rate = resampler(waveform), target_sr
    print("✅ Resampling effectué.")


✅ Resampling effectué.


In [8]:
# 📊 3. Log-Mel spectrogram extraction
# Build the Mel front-end (400-sample FFT/window, 160-sample hop, 80 Mel bands)
# at the current sample rate.
mel_transform = T.MelSpectrogram(
    sample_rate=sample_rate, n_fft=400, win_length=400, hop_length=160, n_mels=80
)

# Apply it to the waveform, then compress the values with log(1 + x).
mel_spec = mel_transform(waveform)
log_mel_spec = mel_spec.log1p()
print(f"Log-MelSpectrogram shape : {log_mel_spec.shape}")

Log-MelSpectrogram shape : torch.Size([2, 80, 66])


In [9]:
# Display the shape of the log-Mel tensor.
print(log_mel_spec.shape)

torch.Size([2, 80, 66])


In [10]:
# 1. On extrait une copie du canal 0
spec = log_mel_spec[0]  # forme : [mel, time]

# 2. Détache du graphe, force sur CPU, transforme en numpy array
spec_np = spec.detach().cpu().numpy()

# 3. Optionnel : vérifier l’absence de valeurs aberrantes
print("Contient NaN :", np.isnan(spec_np).any())
print("Max :", np.max(spec_np), "Min :", np.min(spec_np))


Contient NaN : False
Max : 3.5042062 Min : 3.1626095e-07


In [11]:
import matplotlib
# Switch matplotlib to the Agg backend for the rest of the session.
# NOTE(review): Agg is file-oriented; the later cells that call plt.show()
# will presumably render nothing inline once this has run — confirm, and
# drop this cell if inline display is wanted.
matplotlib.use('Agg')  # To disable interactive backends


In [None]:
import matplotlib.pyplot as plt

# Channel 0 of the log-Mel tensor as a NumPy array laid out [mel, time].
spec_np = log_mel_spec[0].detach().cpu().numpy()

# Render the spectrogram to a PNG file without displaying it.
fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(spec_np, aspect='auto', origin='lower', interpolation='nearest')
ax.set_title("Log-MelSpectrogram")
ax.set_xlabel("Frames")
ax.set_ylabel("Bandes Mel")
# The values are log(1 + x) of the Mel spectrogram, not decibels, so the
# colorbar is labelled accordingly instead of using a "dB" tick format.
fig.colorbar(im, ax=ax, label="log(1 + puissance)")
fig.tight_layout()
fig.savefig("spectrogram.png")
plt.close(fig)  # release the figure once it has been written to disk

print("✅ Image sauvegardée dans spectrogram.png")

In [None]:
# 4. Display
# Plots `spec_np` (channel 0, [mel, time]) prepared by an earlier cell.
# NOTE(review): if a non-interactive backend such as Agg has been selected
# earlier in the session, plt.show() will presumably display nothing — confirm.
fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(spec_np, aspect='auto', origin='lower', interpolation='nearest')
ax.set_title("Log-MelSpectrogram (Canal 0)")
ax.set_xlabel("Frames")
ax.set_ylabel("Bandes Mel")
# The values are log(1 + x) of the Mel spectrogram, not decibels, so the
# colorbar is labelled accordingly instead of using a "dB" tick format.
fig.colorbar(im, ax=ax, label="log(1 + puissance)")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Plot one figure per audio channel. A 2-D tensor ([mel, time], i.e. no channel
# axis) is treated as a single channel so the loop never iterates over Mel bands.
channel_specs = log_mel_spec if log_mel_spec.dim() == 3 else log_mel_spec.unsqueeze(0)

for channel_idx, channel_spec in enumerate(channel_specs):
    fig, ax = plt.subplots(figsize=(10, 4))
    im = ax.imshow(channel_spec.detach().cpu().numpy(), aspect='auto', origin='lower')
    ax.set_title(f"Log-MelSpectrogram (Canal {channel_idx})")
    ax.set_xlabel("Frames (temps)")
    ax.set_ylabel("Bandes Mel")
    # The values are log(1 + x) of the Mel spectrogram, not decibels, so the
    # colorbar is labelled accordingly instead of using a "dB" tick format.
    fig.colorbar(im, ax=ax, label="log(1 + puissance)")
    fig.tight_layout()
    plt.show()


In [None]:
# 🖼️ 4. Visualisation of the Log-Mel spectrogram (channel 0)
fig, ax = plt.subplots(figsize=(10, 4))
# .cpu() keeps the conversion valid even if the tensor lives on an accelerator,
# matching the other plotting cells of this notebook.
im = ax.imshow(log_mel_spec[0].detach().cpu().numpy(), aspect='auto', origin='lower')
ax.set_title("Log-MelSpectrogram")
ax.set_xlabel("Frames (temps)")
ax.set_ylabel("Bandes Mel")
# The values are log(1 + x) of the Mel spectrogram, not decibels, so the
# colorbar is labelled accordingly instead of using a "dB" tick format.
fig.colorbar(im, ax=ax, label="log(1 + puissance)")
fig.tight_layout()
plt.show()

# ✅ This spectrogram can now be fed to an RNN model (BiLSTM + CTC) as input.
