<a href="https://colab.research.google.com/github/JHyunjun/DQTGAN/blob/main/WavtoImage_GAN_BCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yt-dlp
!pip install pydub
!pip install librosa

import os
import yt_dlp
import librosa
from pydub import AudioSegment
import numpy as np
import torch

# Video whose audio track is downloaded as the raw material for the dataset
youtube_url = 'https://www.youtube.com/watch?v=I2ZEMjFJtzM'

# yt-dlp options: request the best available audio stream and run the
# FFmpegExtractAudio post-processor so the result is written as a .wav file
# named "downloaded_audio.<ext>" in the working directory.
ydl_opts = dict(
    format='bestaudio/best',
    outtmpl='downloaded_audio.%(ext)s',
    postprocessors=[
        dict(
            key='FFmpegExtractAudio',
            preferredcodec='wav',
            preferredquality='192',
        ),
    ],
)

# The context manager closes the downloader once the download has finished
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download([youtube_url])

# If running yt_dlp raises an error, try restarting the runtime and re-running this cell.

In [None]:

#%% Basic settings
audio_length = 4  # seconds
audio_length_ms = audio_length * 1000
data_overlap = 50  # percent
data_overlap_ps = data_overlap / 100
sampling_rate = 8192

wav_path = "data_folder/wav_data"
mp3_path = "data_folder/mp3_data"

os.makedirs(wav_path, exist_ok=True)
os.makedirs(mp3_path, exist_ok=True)

# Load the downloaded audio and resample it to the working sampling rate
base_wav = AudioSegment.from_wav("downloaded_audio.wav")
audio = base_wav.set_frame_rate(sampling_rate)

# Segment the audio file and save each segment as both .wav and .mp3.
# NOTE: every window spans 2 * audio_length_ms * data_overlap_ps milliseconds,
# which equals audio_length_ms only while data_overlap is 50.
num_segments = int(len(audio) / (audio_length_ms * data_overlap_ps))

for i in range(1, num_segments):
    tmp_fname_wav = f"audio_wav_{i:04}.wav"
    tmp_fname_mp3 = f"audio_mp3_{i:04}.mp3"
    tmp_audio = audio[(i-1)*audio_length_ms*data_overlap_ps : (i+1)*audio_length_ms*data_overlap_ps]
    tmp_audio.export(os.path.join(wav_path, tmp_fname_wav), format="wav")
    tmp_audio.export(os.path.join(mp3_path, tmp_fname_mp3), format="mp3")

# Load the segmented audio files and compute their STFT
n_fft = 512
hop_length = 128


def _load_magnitudes(folder, files, prefix, size=256):
    """Return the stacked, cropped STFT magnitudes of *files* in *folder*.

    Each file is loaded at ``sampling_rate`` and transformed with the
    module-level ``n_fft`` / ``hop_length``. The full (uncropped) magnitude is
    also published as the global ``<prefix>_<index:04>`` because the
    visualisation cells look those names up.

    Args:
        folder: Directory that contains the audio files.
        files: File names inside *folder*, in the order they should be indexed.
        prefix: Prefix of the global variable names ("wav" or "mp3").
        size: Number of frequency bins and frames kept from each spectrogram.

    Returns:
        Array of shape (len(files), size, size).
    """
    cropped = []
    for index, fname in enumerate(files):
        signal, _ = librosa.load(os.path.join(folder, fname), sr=sampling_rate)
        # Apply absolute to get the magnitude
        magnitude = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))
        globals()[f"{prefix}_{index:04}"] = magnitude
        # Crop before stacking so slightly different frame counts (e.g. from
        # the MP3 round trip) cannot produce a ragged array.
        cropped.append(magnitude[:size, :size])
    return np.stack(cropped)


# os.listdir returns entries in arbitrary order; sort so that index k refers
# to the same segment in both formats, and ignore unrelated files.
wav_files = sorted(f for f in os.listdir(wav_path) if f.endswith(".wav"))
mp3_files = sorted(f for f in os.listdir(mp3_path) if f.endswith(".mp3"))

wav_data = _load_magnitudes(wav_path, wav_files, "wav")
mp3_data = _load_magnitudes(mp3_path, mp3_files, "mp3")

# Add channel dimension for PyTorch: (N, 256, 256) -> (N, 1, 256, 256)
wav_data = np.expand_dims(wav_data, axis=1)
mp3_data = np.expand_dims(mp3_data, axis=1)

# Convert numpy arrays to PyTorch tensors
wav_data = torch.tensor(wav_data).float()
mp3_data = torch.tensor(mp3_data).float()

# Normalize data to the range [-1, 1]; magnitudes are non-negative, so dividing
# by the global maximum gives [0, 1] before the affine shift.
wav_data = (wav_data / torch.max(wav_data)) * 2 - 1
mp3_data = (mp3_data / torch.max(mp3_data)) * 2 - 1

# Create PyTorch datasets
wav_dataset = torch.utils.data.TensorDataset(wav_data)
mp3_dataset = torch.utils.data.TensorDataset(mp3_data)

In [None]:
# Show the shapes of both prepared tensors, one per line
print(wav_data.shape, mp3_data.shape, sep="\n")

# Dump one WAV spectrogram together with its largest and smallest value
i = 5
sample = wav_data[i]
print(sample)
print(torch.max(sample))
print(torch.min(sample))


In [None]:
# Using librosa.amplitude_to_db()
import matplotlib.pyplot as plt
import librosa.display

# Take the first stored STFT magnitude of each format
wav_stft = globals()["wav_0000"]
mp3_stft = globals()["mp3_0000"]

# Convert amplitude to dB
wav_stft_db, mp3_stft_db = map(librosa.amplitude_to_db, (wav_stft, mp3_stft))

print(wav_stft_db.shape)
print(mp3_stft_db.shape)

# Draw the WAV spectrogram in the right panel and the MP3 one in the left panel
db_panels = (
    (2, wav_stft_db, 'Spectrogram (.wav)'),
    (1, mp3_stft_db, 'Spectrogram (.mp3)'),
)
plt.figure(figsize=(14, 5))
for position, spec_db, panel_title in db_panels:
    plt.subplot(1, 2, position)
    librosa.display.specshow(spec_db, sr=sampling_rate, hop_length=hop_length, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title(panel_title)
plt.tight_layout()
plt.show()

print("mp3_data.shape : ",mp3_data.shape)
print("wav_data.shape : ",wav_data.shape)

# Print the entire dB array (already computed above)
print(wav_stft_db)

# Print a specific pixel value
row, col = 10, 10
print(f"The value at row {row}, column {col} is: {wav_stft_db[row, col]}")



In [None]:
# Without librosa.amplitude_to_db(): plot the raw STFT magnitudes
# Take the first stored STFT magnitude of each format
wav_stft = globals()["wav_0000"]
mp3_stft = globals()["mp3_0000"]

# Draw the WAV spectrogram in the right panel and the MP3 one in the left panel
raw_panels = (
    (2, wav_stft, 'Spectrogram (.wav)'),
    (1, mp3_stft, 'Spectrogram (.mp3)'),
)
plt.figure(figsize=(14, 5))
for position, magnitude, panel_title in raw_panels:
    plt.subplot(1, 2, position)
    librosa.display.specshow(magnitude, sr=sampling_rate, hop_length=hop_length, x_axis='time', y_axis='linear')
    plt.colorbar()
    plt.title(panel_title)
plt.tight_layout()
plt.show()

# Print the normalised tensor for the first dataset entry
print(wav_data.shape)
first_wav = wav_data[0]  # assuming the first element corresponds to "wav_0000"
print(first_wav)

# Print a specific pixel value; index 0 selects the single channel
row, col = 10, 10
print(f"The value at row {row}, column {col} is: {first_wav[0][row, col]}")



In [None]:
import torch
from torch import nn
import torch.optim as optim

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.backends.cudnn.enabled = False  # disable cuDNN

In [None]:
import torch
from torch import nn
import torch.optim as optim

# Select the compute device: CUDA when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.backends.cudnn.enabled = False  # disable cuDNN

class Generator(nn.Module):
    """Fully convolutional image-to-image network with a tanh output.

    Four 3x3 convolutions widen the single input channel to 512 feature maps
    (1 -> 64 -> 128 -> 256 -> 512), each followed by an in-place ReLU. A final
    3x3 transposed convolution maps back to one channel. Every layer uses
    stride 1 and padding 1, so the spatial size of the input is preserved.
    """

    def __init__(self):
        super().__init__()

        channel_widths = (1, 64, 128, 256, 512)
        layers = []
        for in_channels, out_channels in zip(channel_widths, channel_widths[1:]):
            layers.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            )
            layers.append(nn.ReLU(inplace=True))

        # Project the 512 feature maps back to a single-channel image
        layers.append(nn.ConvTranspose2d(512, 1, kernel_size=3, stride=1, padding=1))
        layers.append(nn.Tanh())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the generated image for input batch ``x``, values in [-1, 1]."""
        return self.model(x)


class Discriminator(nn.Module):
    """Convolutional classifier that scores a 1-channel image with a probability.

    Four stride-2 4x4 convolutions halve the spatial size each time while
    widening the channels (1 -> 64 -> 128 -> 256 -> 512); all but the first are
    followed by batch normalisation, and every one by LeakyReLU(0.2). A final
    4x4 convolution without padding reduces to one channel, which is flattened
    and passed through a linear layer and a sigmoid. The linear layer expects
    13*13 features, i.e. a 256x256 input image.
    """

    def __init__(self):
        super().__init__()

        # First downsampling stage has no batch normalisation
        layers = [
            nn.Conv2d(1, 64, kernel_size=4, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        # Remaining downsampling stages: conv -> batch norm -> LeakyReLU
        for in_channels, out_channels in ((64, 128), (128, 256), (256, 512)):
            layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.LeakyReLU(0.2, inplace=True),
            ]

        # Head: a 16x16 map becomes 13x13 here for 256x256 inputs
        layers += [
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0),
            nn.Flatten(),
            nn.Linear(13*13, 1),  # fully connected layer
            nn.Sigmoid(),
        ]

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return a (batch, 1) tensor of probabilities for input batch ``x``."""
        return self.model(x)

# Instantiate the Generator and the Discriminator
generator = Generator()
discriminator = Discriminator()

# Move both models to the selected device
# (`device` is chosen above: CUDA when available, otherwise CPU)
generator.to(device)
discriminator.to(device)


In [None]:
# Hyperparameters
batch_size = 1
lr = 0.001
num_epochs = 10

# Create PyTorch data loaders
mp3_loader = torch.utils.data.DataLoader(mp3_dataset, batch_size=batch_size, shuffle=True)
wav_loader = torch.utils.data.DataLoader(wav_dataset, batch_size=batch_size, shuffle=True)

# Loss function: BCELoss expects probabilities, which the Discriminator's
# final nn.Sigmoid already produces.
criterion = nn.BCELoss()

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=lr)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)

for epoch in range(num_epochs):
    for i, (mp3, wav) in enumerate(zip(mp3_loader, wav_loader)):
        # TensorDataset yields 1-tuples; unpack and move to the chosen device
        mp3, wav = mp3[0].to(device), wav[0].to(device)

        # Create the labels for the real and the fake data
        real_labels = torch.ones((mp3.size(0), 1), device=device)
        fake_labels = torch.zeros((mp3.size(0), 1), device=device)

        # ---- Train the discriminator ----
        outputs_real = discriminator(wav)
        d_loss_real = criterion(outputs_real, real_labels)

        fake_images = generator(mp3)
        # Detach so the discriminator update does not backpropagate into the
        # generator. The discriminator output is already a probability, so no
        # extra sigmoid is applied (a second one would confine the scores to
        # (0.5, 0.73) and keep this loss from ever approaching zero).
        outputs_fake = discriminator(fake_images.detach())
        d_loss_fake = criterion(outputs_fake, fake_labels)

        d_loss = d_loss_real + d_loss_fake
        optimizer_D.zero_grad()
        d_loss.backward()
        optimizer_D.step()

        # ---- Train the generator ----
        # The generator weights have not changed since `fake_images` was
        # produced, so the same batch is scored again by the updated
        # discriminator instead of running the generator a second time.
        outputs = discriminator(fake_images)
        g_loss = criterion(outputs, real_labels)
        optimizer_G.zero_grad()
        g_loss.backward()
        optimizer_G.step()

    # Print the losses of the last batch of this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item():.6f}, g_loss: {g_loss.item():.6f}')


In [None]:
import matplotlib.pyplot as plt
import librosa.display

# Function to plot spectrogram
def plot_spectrogram(spec, title, ax=None):
    """Draw a magnitude spectrogram in decibels.

    Args:
        spec: 2-D array of non-negative magnitudes (frequency bins x frames).
        title: Title shown above the plot.
        ax: Matplotlib axes to draw on. When omitted, a new 10x4 inch figure
            is created, so the two-argument call keeps working.
    """
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(10, 4))

    # The inputs are magnitudes, hence amplitude_to_db rather than power_to_db
    spec_db = librosa.amplitude_to_db(spec, ref=np.max)
    mesh = librosa.display.specshow(
        spec_db,
        sr=sampling_rate,
        hop_length=hop_length,
        x_axis='time',
        y_axis='linear',
        ax=ax,
    )
    ax.figure.colorbar(mesh, ax=ax, format='%+2.0f dB')
    ax.set_title(title)
    if created_figure:
        ax.figure.tight_layout()


def _to_magnitude(spec_tensor):
    """Map a [-1, 1] normalised spectrogram tensor back to [0, 1] magnitudes."""
    return ((spec_tensor.detach().cpu() + 1) / 2).clamp(min=0).numpy()


# Run the Generator on the first MP3 spectrogram only. The input is moved to
# the model's device and autograd is disabled, since this is inference.
with torch.no_grad():
    generated_wav = generator(mp3_data[:1].to(device))

# The tensors already hold STFT magnitudes (normalised to [-1, 1]), so they are
# only rescaled for display; no further STFT is applied.
generated_spec = _to_magnitude(generated_wav[0, 0])

# First original WAV spectrogram
original_spec = _to_magnitude(wav_data[0, 0])

# First MP3 spectrogram (the Generator input)
mp3_spec = _to_magnitude(mp3_data[0, 0])

# Plot the Spectrograms side by side in one figure
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
plot_spectrogram(generated_spec, title='Generated Wav Spectrogram', ax=axes[0])
plot_spectrogram(original_spec, title='Original Wav Spectrogram', ax=axes[1])
plot_spectrogram(mp3_spec, title='MP3 Spectrogram', ax=axes[2])
fig.tight_layout()

# Show the plots
plt.show()
