<a href="https://colab.research.google.com/github/JHyunjun/DQTGAN/blob/main/230715_DQTGAN_DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Created by Hunjun, JANG
# Recent revision date : 23.07.15
# DQT-GAN(Data Quality Transformation-Generative Adversarial Network)

!pip install pytube
!pip install pydub
!pip install librosa

%cd /content/drive/MyDrive/Colab Notebooks/GAN/DQT-GAN/Data

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the current working directory; the relative paths used by the
# following cells ('temp.mp4', 'audio.wav', 'slices/') resolve against it.
! pwd

In [None]:
from pytube import YouTube
from pydub import AudioSegment
import librosa
import soundfile as sf
import numpy as np
import os
import matplotlib.pyplot as plt

# --- Configuration ----------------------------------------------------------
# Youtube url
url = 'https://www.youtube.com/watch?v=83EzIW3MbAI'

CLIP_START_MS = 2 * 60 * 1000  # excerpt starts 2 minutes into the track (pydub slices in ms)
CLIP_END_MS = 5 * 60 * 1000    # excerpt ends 5 minutes into the track
SLICE_SECONDS = 4              # length of the slice saved for training
LOW_SR = 8000                  # low-quality sampling rate in Hz
HIGH_SR = 44100                # high-quality sampling rate in Hz


def plot_waveform(samples, title):
    """Plot a 1-D audio signal against its sample index.

    Args:
        samples: Sequence of amplitude values to draw.
        title: Text shown as the figure title.
    """
    plt.figure(figsize=(10, 4))
    plt.plot(samples)
    plt.ylabel('Amplitude')
    plt.xlabel('Sample index')
    plt.title(title)
    plt.show()


# --- Download ---------------------------------------------------------------
yt = YouTube(url)
stream = yt.streams.filter(only_audio=True).first()
if stream is None:
    # .first() yields None when no stream matches the filter; fail with a
    # clear message instead of an AttributeError on the next line.
    raise RuntimeError('No audio-only stream available for {}'.format(url))
stream.download(filename='temp.mp4')  # save it as 'temp'

# --- mp4 -> wav excerpt -----------------------------------------------------
audio = AudioSegment.from_file('temp.mp4')
audio = audio[CLIP_START_MS:CLIP_END_MS]
audio.export('audio.wav', format='wav')

# --- Load the excerpt at both sampling rates --------------------------------
y_8k, sr_8k = librosa.load('audio.wav', sr=LOW_SR)
y_44k, sr_44k = librosa.load('audio.wav', sr=HIGH_SR)
os.makedirs('slices', exist_ok=True)

# --- Save the first SLICE_SECONDS seconds at each rate ----------------------
first_slice_8k = y_8k[:sr_8k * SLICE_SECONDS]
first_slice_44k = y_44k[:sr_44k * SLICE_SECONDS]
sf.write('slices/slice_0_8k.wav', first_slice_8k, sr_8k)
sf.write('slices/slice_0_44k.wav', first_slice_44k, sr_44k)

# --- Visual check of both slices --------------------------------------------
plot_waveform(first_slice_8k, '8kHz Waveform of the first 4-second audio')
plot_waveform(first_slice_44k, '44.1kHz Waveform of the first 4-second audio')

# Cleanup of the intermediate files is left disabled; uncomment to remove them.
# os.remove('temp.mp4')
# os.remove('audio.wav')


In [None]:
import torch
import torch.nn as nn
import librosa
import numpy as np
import os

# Define DNN Model
class AudioUpsampler(nn.Module):
    """Fully connected network: five ReLU hidden layers and a linear output layer.

    The layers act on the last dimension of the input, which must equal
    ``input_size``; the result has ``output_size`` in that dimension.

    Args:
        input_size: Number of features per input row.
        hidden_size: Width of every hidden layer.
        output_size: Number of features per output row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names fc1..fc6 are kept so state_dict keys stay the same.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.fc6 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run ``x`` through the hidden layers and return the linear output."""
        # Every hidden layer needs its own non-linearity: Linear layers stacked
        # without one collapse into a single affine map, so the extra depth
        # would add parameters but no expressive power.
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            x = torch.relu(hidden(x))
        # No activation on the output: the regression target is signed audio.
        return self.fc6(x)

# Load Training Data
SR_IN = 8000     # sampling rate of the low-quality input, in Hz
SR_OUT = 44100   # sampling rate of the high-quality target, in Hz
input_data_8k, _ = librosa.load('slices/slice_0_8k.wav', sr=SR_IN)
output_data_44k, _ = librosa.load('slices/slice_0_44k.wav', sr=SR_OUT)
if len(input_data_8k) == 0 or len(output_data_44k) == 0:
    raise ValueError('Training audio is empty; regenerate the files in slices/ first.')

# Align the input with the target in time.
# Zero-padding the 8 kHz signal up to the 44.1 kHz length would pair input
# sample i (time i/8000 s) with target sample i (time i/44100 s) and fill most
# of the input with zeros. Instead, linearly interpolate the input onto the
# target's time grid so row i of both arrays refers to the same instant.
# After this step input_data_8k is spaced at 1/SR_OUT seconds per sample.
target_size = len(output_data_44k)
time_in = np.arange(len(input_data_8k)) / SR_IN
time_out = np.arange(target_size) / SR_OUT
input_data_8k = np.interp(time_out, time_in, input_data_8k).astype(np.float32)

# Data Dimension Transformation: one sample per row, a single feature column
input_data_8k = torch.as_tensor(input_data_8k, dtype=torch.float32).view(-1, 1)
output_data_44k = torch.as_tensor(output_data_44k, dtype=torch.float32).view(-1, 1)

# Set Hyperparameters
input_size = 1
hidden_size = 128
output_size = 1
learning_rate = 0.001
num_epochs = 100

# Seed before the model is built so weight initialisation repeats across runs
torch.manual_seed(0)

# Create DNN Model
model = AudioUpsampler(input_size, hidden_size, output_size)

# Define Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training: full-batch gradient descent over the whole slice each epoch
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    output = model(input_data_8k)
    loss = criterion(output, output_data_44k)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

# Save the Trained Model
torch.save(model.state_dict(), 'dnn_model.pt')


In [None]:
import matplotlib.pyplot as plt

# Validate: run the trained network on the training input, gradients disabled
model.eval()  # set the model to evaluation mode
with torch.no_grad():
    predicted_output_44k = model(input_data_8k)

# NumPy views of the tensors, as needed by matplotlib
input_data_8k_np, predicted_output_44k_np, output_data_44k_np = (
    tensor.numpy()
    for tensor in (input_data_8k, predicted_output_44k, output_data_44k)
)

# Time axes in seconds, derived from each array's length and nominal rate
n_input = len(input_data_8k_np)
n_target = len(output_data_44k_np)
time_8k = np.linspace(0, n_input / 8000, n_input)
time_44k = np.linspace(0, n_target / 44100, n_target)

# One stacked panel per signal: input, prediction, ground truth
panels = (
    (time_8k, input_data_8k_np, '8kHz input'),
    (time_44k, predicted_output_44k_np, 'Predicted 44kHz output'),
    (time_44k, output_data_44k_np, 'Actual 44kHz output'),
)

plt.figure(figsize=(12, 8))
for row, (time_axis, signal, label) in enumerate(panels, start=1):
    plt.subplot(3, 1, row)
    plt.plot(time_axis, signal, label=label)
    plt.legend()

plt.show()
