In [None]:
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import os
import numpy as np

# ***Audio to Spectrogram***

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import io
import numpy as np
from PIL import Image
import pydub
from scipy.io import wavfile
import torch
import torchaudio
import argparse
from typing import List
import os
from pathlib import Path

In [None]:
# Declare the command-line options of the audio -> spectrogram conversion
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i", "--input_folder",
    type=str,
    default="/content/drive/MyDrive/dataset",
    help="Input folder containing music files",
)
parser.add_argument(
    "-o", "--output_folder",
    type=str,
    default="output",
    help="Output Folder",
)
parser.add_argument(
    "-m", "--maxvol",
    type=int,
    default=100,
    help="Max Volume, 255 for identical results",
)
parser.add_argument(
    "-p", "--powerforimage",
    type=float,
    default=0.25,
    help="Power for Image",
)
parser.add_argument(
    "-n", "--nmels",
    type=int,
    default=512,
    help="n_mels to use for Image, basically HEIGHT. Higher = more fidelity",
)
parser.add_argument(
    "-d", "--duration",
    type=int,
    default=5119,
    help="Duration of each chunk",
)

# A notebook has no real command line, so the argument list is given explicitly
args = parser.parse_args(
    args=["-i", "/content/drive/MyDrive/music_dataset/kudum", "-o", "kudum_output"]
)

# Re-assert the expected type of every option (str / int / float) in one step
vars(args).update(
    input_folder=str(args.input_folder),
    output_folder=str(args.output_folder),
    maxvol=int(args.maxvol),
    powerforimage=float(args.powerforimage),
    nmels=int(args.nmels),
    duration=int(args.duration),
)

def spectrogram_image_from_wav(
    wav_bytes: io.BytesIO,
    max_volume: float = 50,
    power_for_image: float = 0.25,
    ms_duration: int = 5119,
    nmels: int = 512) -> Image.Image:
    """Render the audio held in a WAV byte stream as a mel-spectrogram image.

    The WAV data is read with ``scipy.io.wavfile.read``, converted to a
    mel-scaled magnitude spectrogram by ``spectrogram_from_waveform`` and then
    turned into an image by ``image_from_spectrogram``.

    Args:
        wav_bytes: File-like object positioned at the start of WAV data.
        max_volume: Forwarded to ``image_from_spectrogram``.
        power_for_image: Forwarded to ``image_from_spectrogram``.
        ms_duration: Kept for interface compatibility only. It never
            influenced the result; the whole clip in ``wav_bytes`` is converted.
        nmels: Number of mel bands, forwarded to ``spectrogram_from_waveform``.

    Returns:
        The image produced by ``image_from_spectrogram``.
    """
    # Read the audio samples and their sample rate
    sample_rate, waveform = wavfile.read(wav_bytes)

    # Multi-channel WAV data is 2-D (samples, channels). The spectrogram step
    # flattens its input into a single row, which would interleave the
    # channels, so mix down to mono first. Mono input is left untouched.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # FFT parameters, expressed in milliseconds of audio
    window_duration_ms = 100
    padded_duration_ms = 400
    step_size_ms = 10

    # Convert the millisecond settings into sample counts for this sample rate
    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
    hop_length = int(step_size_ms / 1000.0 * sample_rate)
    win_length = int(window_duration_ms / 1000.0 * sample_rate)

    Sxx = spectrogram_from_waveform(
        waveform=waveform,
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        mel_scale=True,
        n_mels=nmels,
    )

    return image_from_spectrogram(
        Sxx,
        max_volume=max_volume,
        power_for_image=power_for_image)

def spectrogram_from_waveform(
    waveform: np.ndarray,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    win_length: int,
    mel_scale: bool = True,
    n_mels: int = 512,
) -> np.ndarray:
    """Compute a magnitude spectrogram of a waveform, optionally mel-scaled.

    Args:
        waveform: Audio samples; cast to float32 and flattened into one row.
        sample_rate: Sample rate in Hz, used by the mel filter bank.
        n_fft: FFT size in samples.
        hop_length: Step between successive frames in samples.
        win_length: Analysis window length in samples.
        mel_scale: When true, map the frequency axis onto ``n_mels`` mel bands.
        n_mels: Number of mel bands used when ``mel_scale`` is true.

    Returns:
        The magnitude spectrogram as a NumPy array (mel-scaled if requested).
    """
    # power=None is requested so the transform output can be reduced to
    # magnitudes explicitly with np.abs below
    stft = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        power=None,
        hop_length=hop_length,
        win_length=win_length,
    )

    single_row = torch.from_numpy(waveform.astype(np.float32)).reshape(1, -1)
    # Index 0 selects the only row of the batch created by the reshape above
    magnitudes = np.abs(stft(single_row).numpy()[0])

    if not mel_scale:
        return magnitudes

    # HTK-style mel filter bank covering 0 Hz .. 10 kHz, without normalization
    to_mel = torchaudio.transforms.MelScale(
        n_mels=n_mels,
        sample_rate=sample_rate,
        f_min=0,
        f_max=10000,
        n_stft=n_fft // 2 + 1,
        norm=None,
        mel_scale="htk",
    )
    return to_mel(torch.from_numpy(magnitudes)).numpy()

def image_from_spectrogram(
        data: np.ndarray,
        max_volume: float = 50,
        power_for_image: float = 0.25
) -> Image.Image:
    """Convert a spectrogram magnitude array into an 8-bit image.

    The magnitudes are compressed with a power curve, scaled so that
    ``max_volume`` maps to 255, inverted (louder = darker) and flipped
    vertically before being converted to a PIL image.

    Args:
        data: Spectrogram magnitudes.
        max_volume: Compressed magnitude that maps to the darkest pixel.
            Must be positive.
        power_for_image: Exponent of the compression curve.

    Returns:
        The spectrogram rendered as a PIL image.

    Raises:
        ValueError: If ``max_volume`` is not positive.
    """
    if max_volume <= 0:
        raise ValueError(f"max_volume must be positive, got {max_volume}")

    # Apply the power curve
    data = np.power(data, power_for_image)
    # Rescale so that max_volume corresponds to 255
    data = data / (max_volume / 255)
    # Invert
    data = 255 - data
    # Values louder than max_volume are negative at this point; without
    # clipping the uint8 cast below would wrap them around to bright pixels.
    data = np.clip(data, 0, 255)
    # Flip Y
    data = data[::-1]
    image = Image.fromarray(data.astype(np.uint8))
    return image

def spectrogram_images_from_folder(
    input_folder: str,
    output_folder: str,
    max_volume: float = 50,
    power_for_image: float = 0.25,
    nmels: int = 512,
    duration: int = 5119
) -> None:
    """Convert every MP3/WAV file directly inside a folder to spectrogram images.

    Files are processed in sorted name order so repeated runs behave the same,
    and the extension is matched case-insensitively (``.MP3`` and ``.Wav`` are
    picked up too). Sub-folders are not searched.

    Args:
        input_folder: Folder containing the audio files.
        output_folder: Folder handed to ``process_audio_file`` for the images.
        max_volume: Forwarded to ``process_audio_file``.
        power_for_image: Forwarded to ``process_audio_file``.
        nmels: Forwarded to ``process_audio_file``.
        duration: Forwarded to ``process_audio_file``.
    """
    audio_extensions = (".mp3", ".wav")

    # Walk the audio files of the input folder in a deterministic order
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(audio_extensions):
            continue

        # Build the full path of the audio file
        audio_file_path = os.path.join(input_folder, filename)

        # Convert the file and save its images
        process_audio_file(
            audio_file_path,
            output_folder,
            max_volume=max_volume,
            power_for_image=power_for_image,
            nmels=nmels,
            duration=duration
        )

def process_audio_file(
    audio_file_path: str,
    output_folder: str,
    max_volume: float = 50,
    power_for_image: float = 0.25,
    nmels: int = 512,
    duration: int = 5119
) -> None:
    """Split one audio file into fixed-length chunks and save a spectrogram image per chunk.

    The file is loaded with pydub, mixed down to one channel and resampled to
    44100 Hz. Each chunk is exported to an in-memory WAV, converted with
    ``spectrogram_image_from_wav`` and written with ``save_image`` into a
    sub-folder of ``output_folder`` named after the audio file. Audio left over
    after the last full chunk is dropped.

    Args:
        audio_file_path: Path of the MP3 or WAV file to convert.
        output_folder: Root folder for the generated images.
        max_volume: Forwarded to ``spectrogram_image_from_wav``.
        power_for_image: Forwarded to ``spectrogram_image_from_wav``.
        nmels: Forwarded to ``spectrogram_image_from_wav``.
        duration: Chunk length in milliseconds (pydub lengths and slices are
            expressed in milliseconds). Must be positive.

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    if duration <= 0:
        raise ValueError(f"duration must be a positive number of milliseconds, got {duration}")

    # Load the MP3 or WAV file
    audio = pydub.AudioSegment.from_file(audio_file_path)

    # Convert to a single channel and set the frame rate
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(44100)

    # Number of complete 'duration'-millisecond chunks in the audio
    interval_count = len(audio) // duration

    print("İŞLENECEK PARÇA SAYISI:", interval_count)

    base_filename = os.path.basename(audio_file_path)

    # Convert the chunks one at a time. Images are written straight to disk
    # and not kept in memory, so long files do not pile up images.
    for i in range(interval_count):
        print("İŞLENDİ:", i, "/", interval_count)
        # Slice out the i-th chunk
        interval_audio = audio[i*duration:(i+1)*duration]

        # Export the chunk as WAV into an in-memory buffer
        wav_bytes = io.BytesIO()
        interval_audio.export(wav_bytes, format="wav")
        wav_bytes.seek(0)

        # Build the spectrogram image from the WAV data
        spectrogram_image = spectrogram_image_from_wav(
            wav_bytes,
            max_volume=max_volume,
            power_for_image=power_for_image,
            ms_duration=duration,
            nmels=nmels
        )

        # Save the image
        save_image(
            spectrogram_image,
            base_filename,
            i,
            output_folder
        )

def save_image(image: Image.Image, base_filename: str, index: int, output_folder: str):
    """Save one spectrogram image as ``<output_folder>/<base_filename>/<index>.png``.

    The sub-folder is created when missing, and the index is zero-padded to
    five digits (e.g. ``00003.png``).
    """
    target_dir = os.path.join(output_folder, base_filename)
    os.makedirs(target_dir, exist_ok=True)

    # Write the image to its numbered file inside the sub-folder
    image.save("{}/{:05d}.png".format(target_dir, index))

# Main step: convert every audio file of the configured input folder
if __name__ == "__main__":
    # Positional order follows the function signature:
    # input_folder, output_folder, max_volume, power_for_image, nmels, duration
    spectrogram_images_from_folder(
        args.input_folder,
        args.output_folder,
        args.maxvol,
        args.powerforimage,
        args.nmels,
        args.duration,
    )
    print("İŞLEM TAMAMLANDI")

**********************************************************

# **Spectrogram to Audio**

In [None]:
import io
import typing as T

import numpy as np
from PIL import Image
import pydub
from scipy.io import wavfile
import torch
import torchaudio
import argparse


In [None]:

def spectrogram_from_image(
        image: Image.Image,
        max_volume: float = 50,
        power_for_image: float = 0.25
) -> np.ndarray:
    """
    Recover a spectrogram magnitude array from a spectrogram image.

    This undoes the steps of image_from_spectrogram in reverse order: flip the
    rows, invert the pixel values, rescale them to ``max_volume`` and reverse
    the power curve. For images with several channels only the first channel
    is used.
    """
    # Pixel values as floats
    pixels = np.array(image).astype(np.float32)

    # Flip Y; keep only the first channel when the image has a channel axis
    flipped = pixels[::-1] if pixels.ndim < 3 else pixels[::-1, :, 0]

    # Invert, then rescale so that 255 corresponds to max_volume
    scaled = (255 - flipped) * max_volume / 255

    # Reverse the power curve
    return np.power(scaled, 1 / power_for_image)

def waveform_from_spectrogram(
    Sxx: np.ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    num_samples: int,
    sample_rate: int,
    mel_scale: bool = True,
    n_mels: int = 512,
    max_mel_iters: int = 200,
    num_griffin_lim_iters: int = 32,
    device: str = "cuda:0",
) -> np.ndarray:
    """
    Reconstruct a waveform from a spectrogram.

    This is an approximate inverse of spectrogram_from_waveform, using the Griffin-Lim algorithm
    to approximate the phase.

    Args:
        Sxx: Magnitude spectrogram (mel-scaled when ``mel_scale`` is true).
        n_fft: FFT size in samples.
        hop_length: Step between successive frames in samples.
        win_length: Analysis window length in samples.
        num_samples: Accepted for interface compatibility; not used here.
        sample_rate: Sample rate in Hz, used by the inverse mel filter bank.
        mel_scale: When true, undo the mel scaling before Griffin-Lim.
        n_mels: Number of mel bands of ``Sxx`` when ``mel_scale`` is true.
        max_mel_iters: Iteration limit passed to the inverse mel transform
            when the installed torchaudio accepts it.
        num_griffin_lim_iters: Number of Griffin-Lim iterations.
        device: Torch device to compute on. A CUDA device is replaced by the
            CPU (with a warning) when CUDA is not available.

    Returns:
        The reconstructed waveform as a NumPy array.
    """
    # The default device is a GPU; fall back to the CPU instead of crashing
    # on machines without CUDA.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        import warnings

        warnings.warn(
            f"CUDA is not available; reconstructing on the CPU instead of {device!r}",
            RuntimeWarning,
        )
        device = "cpu"

    Sxx_torch = torch.from_numpy(Sxx).to(device)

    # TODO(hayk): Make this a class that caches the two things

    if mel_scale:
        mel_kwargs = dict(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
        )
        try:
            mel_inv_scaler = torchaudio.transforms.InverseMelScale(
                max_iter=max_mel_iters, **mel_kwargs
            )
        except TypeError:
            # NOTE(review): newer torchaudio releases appear to reject the
            # `max_iter` keyword - confirm against the installed version.
            mel_inv_scaler = torchaudio.transforms.InverseMelScale(**mel_kwargs)

        Sxx_torch = mel_inv_scaler.to(device)(Sxx_torch)

    # power=1.0 because the input holds magnitudes rather than power values
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,
        n_iter=num_griffin_lim_iters,
    ).to(device)

    waveform = griffin_lim(Sxx_torch).cpu().numpy()

    return waveform

def wav_bytes_from_spectrogram_image(image: Image.Image, duration: int, nmels: int, maxvol: int, power_for_image: float) -> T.Tuple[io.BytesIO, float]:
    """
    Reconstruct a WAV audio clip from a spectrogram image. Also returns the duration in seconds.

    Args:
        image: Spectrogram image to convert.
        duration: Clip length in milliseconds represented by a 512-pixel-wide image.
        nmels: Number of mel bands used when the image was created.
        maxvol: ``max_volume`` used when the image was created.
        power_for_image: Power-curve exponent used when the image was created.

    Returns:
        A tuple of the WAV data as a rewound BytesIO (16-bit samples at
        44100 Hz) and the clip duration in seconds.
    """
    Sxx = spectrogram_from_image(image, max_volume=maxvol, power_for_image=power_for_image)

    sample_rate = 44100  # [Hz]
    clip_duration_ms = duration  # [ms]

    bins_per_image = 512

    # FFT parameters
    window_duration_ms = 100  # [ms]
    padded_duration_ms = 400  # [ms]
    step_size_ms = 10  # [ms]

    # Derived parameters
    # Milliseconds must be converted to seconds before multiplying by the
    # sample rate; otherwise the count is 1000 times too large. The
    # waveform_from_spectrogram defined in this file accepts but does not use it.
    num_samples = int(image.width / float(bins_per_image) * clip_duration_ms / 1000.0 * sample_rate)
    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
    hop_length = int(step_size_ms / 1000.0 * sample_rate)
    win_length = int(window_duration_ms / 1000.0 * sample_rate)

    samples = waveform_from_spectrogram(
        Sxx=Sxx,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        num_samples=num_samples,
        sample_rate=sample_rate,
        mel_scale=True,
        n_mels=nmels,
        max_mel_iters=200,
        num_griffin_lim_iters=32,
    )

    # Clamp to the int16 range first: a plain cast of out-of-range values
    # wraps around and produces loud clicks in the audio.
    int16_range = np.iinfo(np.int16)
    pcm_samples = np.clip(samples, int16_range.min, int16_range.max).astype(np.int16)

    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, pcm_samples)
    wav_bytes.seek(0)

    duration_s = float(len(samples)) / sample_rate

    return wav_bytes, duration_s

def write_bytesio_to_file(filename, bytesio):
    """
    Write the full contents of the given BytesIO to a file.
    The file is created if it does not exist yet and
    overwritten if it does.
    """
    with open(filename, "wb") as sink:
        # getbuffer() exposes the whole stream content without copying it
        content_view = bytesio.getbuffer()
        sink.write(content_view)

# Command-line options of the spectrogram image -> audio conversion.
# The help texts describe this direction (image in, WAV out), and the numeric
# options carry explicit types so values given as text are parsed up front.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, help="Input spectrogram image to convert back to audio")
parser.add_argument("-o", "--output", required=True, help="Output WAV file")
parser.add_argument("-d", "--duration", type=int, default=5119, help="Image duration in milliseconds")
parser.add_argument("-m", "--maxvol", type=int, default=100, help="Max Volume, 255 for identical results")
parser.add_argument("-p", "--powerforimage", type=float, default=0.25, help="Power for Image")
parser.add_argument("-n", "--nmels", type=int, default=512, help="n_mels used for the Image, basically HEIGHT. Higher = more fidelity")
# A notebook has no real command line, so the argument list is given explicitly
args = parser.parse_args(args=["-i", "/content/drive/MyDrive/final_image/ud_gamzedeyimdeva (1).png", "-o", "ud.wav"])

# The path of the input image is stored in the `input` attribute of `args`
filename = args.input
image = Image.open(filename)
wav_bytes, duration_s = wav_bytes_from_spectrogram_image(
    image,
    duration=args.duration,
    nmels=args.nmels,
    maxvol=args.maxvol,
    power_for_image=args.powerforimage,
)
write_bytesio_to_file(args.output, wav_bytes)

************************************************************************************************************