In [None]:
import os
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np 
import matplotlib

In [None]:
import pandas as pd
import librosa

# Transcripts: every value of the 'Русская речь' column, in sheet order.
file = pd.read_excel('/kaggle/input/rus-speech/Speeches.xlsx')
y = list(file['Русская речь'])

dir_name = "/kaggle/input/upd-speech/mono_voice/"
files_in_dir = os.listdir(dir_name)

# Load recordings 1.wav .. 2000.wav by number.
# NOTE(review): presumably X[k] is meant to pair with y[k] — confirm the sheet
# rows are in the same order as the file numbering.
X = []

for e in range(1, 2001):
    file_name = f'{e}.wav'
    # librosa.load returns (samples, sample_rate); only the samples are kept.
    sampl = librosa.load(os.path.join(dir_name, file_name), sr=16000)[0]
    sampl = sampl[np.newaxis, :]  # add a leading axis: (n,) -> (1, n)
    X.append(torch.Tensor(sampl))
    # Progress report every 100 files, driven by the loop index itself
    # (a separate hand-maintained counter is not needed).
    if e % 100 == 0:
        print(e)

In [None]:
# Sanity check: display the Python type of the first loaded sample.
type(X[0])

In [None]:
# Character vocabulary: lowercase Cyrillic letters plus the space, each mapped to an integer id.
# NOTE(review): the ids of "ч", "ц", ... "х" do not follow alphabetical order; they are kept
# exactly as they were because other code may rely on these ids.
char_map = {"а": 0, "б": 1, "в": 2, "г": 3, "д": 4, "е": 5, "ё": 6, "ж": 7, "з": 8, "и": 9, "й": 10,
            "к": 11, "л": 12, "м": 13, "н": 14, "о": 15, "п": 16, "р": 17, "с": 18, "т": 19, "у": 20,
            "ф": 21, "ч": 22, "ц": 23, "ш": 24, "щ": 25, "ъ": 26, "ы": 27, "ь": 28, "э": 29, "ю": 30,
            "я": 31, "х": 32, " ": 33}

def remove_characters(sentence):
    """Normalize a transcript so it only contains characters from ``char_map``.

    Steps: lowercase the text, spell out the tokens the function knows about
    ('р-220', '4', '6'), turn hyphens into spaces, drop every character that is
    not a key of ``char_map``, and collapse runs of whitespace to single spaces.

    Args:
        sentence: Raw transcript string.

    Returns:
        The cleaned string (may be empty). Digits other than 4 and 6 are
        dropped, not spelled out.
    """
    sentence = sentence.lower()
    # The text is already lowercased here, so the model name has to be matched
    # in lowercase; an uppercase pattern would never match and "Р-220" would be
    # reduced to a lone "р". It is also handled before '-' becomes a space.
    sentence = (
        sentence.replace('р-220', 'р двести двадцать')
        .replace('4', 'четыре')
        .replace('6', 'шесть')
        .replace("-", " ")
    )
    sentence = ''.join(ch for ch in sentence if ch in char_map)
    # split()/join collapses repeated spaces and trims both ends.
    return " ".join(sentence.split())

# Clean every transcript; y is replaced by the list of normalized strings.
y = list(map(remove_characters, y))

In [None]:
# Disabled train/test split: the code below sits inside a string literal, so
# nothing in it is executed (the cell only evaluates to that string).
"""from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train = X[:1800]
X_test = X[1800:]
y_train = y[:1800]
y_test = y[1800:]"""

In [None]:
# Keep the loaded samples under X_train and start X over as an empty list.
# NOTE(review): re-running this cell after X has been refilled rebinds X_train
# to whatever X holds at that point.
X_train = X
X = []

In [None]:
# Remove size-1 axes from every sample (the loader stores them as (1, n)).
# X is rebuilt from X_train in one step rather than appended to, so running
# this cell more than once cannot duplicate samples.
X = [tensor.squeeze() for tensor in X_train]

In [None]:
# Pick the sample with the most elements and show its shape.
max_tensor = max(X, key=lambda x: x.numel())

print(max_tensor.shape)

In [None]:
# Sanity check: display the Python type of the first element of X.
type(X[0])

In [None]:
# Display the first element of X.
X[0]

In [None]:
import torch.nn.functional as F

# Target length, in elements, that every 1-D sample is brought to.
desired_size = (213830,)
target_len = desired_size[0]

# F.pad reads the pad list as (left, right) amounts for the last dimension.
# The whole length deficit goes into the left slot and the right slot stays 0,
# so the zeros are placed in front of the existing values.
# NOTE(review): a sample longer than desired_size produces a negative amount,
# which F.pad applies as cropping — confirm that front-padding / front-cropping
# is what is wanted here.
resized_tensors = [
    F.pad(tensor, [target_len - tensor.shape[0], 0], value=0)
    for tensor in X
]

X = resized_tensors

In [None]:
# Spot check: display the shape of one padded sample.
X[5].shape

In [None]:
from torch.utils.data import Dataset

class AudioDataset(Dataset):
    """Dataset that pairs audio items with transcripts by position.

    Item ``k`` is the tuple ``(audio_list[k], text_list[k])``.
    """

    def __init__(self, audio_list, text_list):
        # Both sequences are stored as given: no copy and no length check.
        self.audio_list, self.text_list = audio_list, text_list

    def __len__(self):
        # The reported size comes from the transcripts, not from the audio.
        return len(self.text_list)

    def __getitem__(self, index):
        return self.audio_list[index], self.text_list[index]

In [None]:
# Wrap the samples in X and the transcripts in y in an AudioDataset;
# the bare name on the last line displays the object's repr.
train_dataset = AudioDataset(X, y)
train_dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Generator definition
class Generator(nn.Module):
    """Three-layer fully connected network: input_dim -> 256 -> 512 -> output_dim.

    ReLU follows the first two linear layers and tanh follows the last one, so
    every output value lies in [-1, 1].
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict keys
        # and seeded initialisation stay the same.
        self.fc = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply the network to ``x``, whose last dimension must be input_dim."""
        hidden = x
        for layer in (self.fc, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.tanh(self.fc3(hidden))

# Discriminator definition
class Discriminator(nn.Module):
    """Three-layer fully connected network: input_dim -> 512 -> 256 -> 1.

    ReLU follows the first two linear layers and a sigmoid follows the last
    one, so the single output value lies in (0, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names and creation order are kept as-is so state_dict keys
        # and seeded initialisation stay the same.
        self.fc = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the network to ``x``, whose last dimension must be input_dim."""
        hidden = x
        for layer in (self.fc, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.sigmoid(self.fc3(hidden))

In [None]:
# GAN training function
def train_gan(generator, discriminator, dataloader, num_epochs, device, noise_dim=100):
    """Train a generator/discriminator pair with binary cross-entropy losses.

    Each step first updates the discriminator on one real batch and one
    generated batch, then updates the generator so that the discriminator
    labels its output as real. Both optimizers are Adam with lr=0.0002.

    Args:
        generator: Module mapping a (batch, noise_dim) tensor to fake samples.
        discriminator: Module mapping samples to a (batch, 1) probability.
        dataloader: Iterable of batches. A batch may be a tensor, or a
            tuple/list whose first element is the audio tensor (for example an
            ``(audio, text)`` pair); anything after the first element is
            ignored. A 1-D audio tensor is treated as a batch of one sample.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the batches, labels and noise are placed on.
        noise_dim: Length of the noise vector fed to the generator; must match
            the generator's input size. Defaults to 100.

    Returns:
        None. The two models are updated in place.
    """
    criterion = nn.BCELoss()
    g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
    d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)

    for epoch in range(num_epochs):
        for i, batch in enumerate(dataloader):
            # Accept (audio, text)-style batches as well as bare tensors:
            # only the audio tensor takes part in training.
            real_data = batch[0] if isinstance(batch, (list, tuple)) else batch
            real_data = real_data.to(device)
            if real_data.dim() == 1:
                # A single un-batched sample: add the batch axis so that the
                # discriminator output lines up with the (batch, 1) labels.
                real_data = real_data.unsqueeze(0)
            batch_size = real_data.size(0)

            # --- Discriminator update ---
            d_optimizer.zero_grad()
            real_labels = torch.ones(batch_size, 1, device=device)
            fake_labels = torch.zeros(batch_size, 1, device=device)

            # Real data through the discriminator
            real_outputs = discriminator(real_data)
            d_loss_real = criterion(real_outputs, real_labels)

            # Fake data through the discriminator; detach() keeps this loss
            # from back-propagating into the generator.
            noise = torch.randn(batch_size, noise_dim, device=device)
            fake_data = generator(noise)
            fake_outputs = discriminator(fake_data.detach())
            d_loss_fake = criterion(fake_outputs, fake_labels)

            # Total discriminator loss
            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            d_optimizer.step()

            # --- Generator update ---
            # The generator is rewarded when its samples are scored as real.
            g_optimizer.zero_grad()
            fake_outputs = discriminator(fake_data)
            g_loss = criterion(fake_outputs, real_labels)
            g_loss.backward()
            g_optimizer.step()

            # Progress report every 100 steps
            if (i + 1) % 100 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}],"
                      f" d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_audio(batch):
    """Stack the audio tensors of (audio, text) samples into one batch tensor.

    The transcripts are dropped, so each batch handed to the training loop is a
    single tensor with the batch axis first.
    """
    return torch.stack([audio for audio, _ in batch])


# Audio data: batches are built by a DataLoader, so `dataloader` is defined
# for the training call and for later inspection.
dataset = train_dataset
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_audio)

# Generator / discriminator sizes
input_dim = 100  # length of the input noise vector
# Sample length is read from the data instead of being typed in by hand, so it
# cannot drift from the length the samples were padded to.
output_dim = dataset[0][0].numel()

# Build the generator and the discriminator
generator = Generator(input_dim, output_dim).to(device)
discriminator = Discriminator(output_dim).to(device)

# Train the GAN
num_epochs = 10
train_gan(generator, discriminator, dataloader, num_epochs, device)

In [None]:
# Display the batch size of `dataloader` (the name must already be defined
# by an earlier cell for this to run).
dataloader.batch_size

In [None]:
import wave

def get_wav_duration(directory):
    """Return the summed length, in seconds, of the '.wav' files in ``directory``.

    Only entries listed directly in ``directory`` whose name ends with '.wav'
    are read; each file contributes frame count divided by frame rate.
    Returns 0 when no such file is present.
    """
    total_duration = 0
    wav_names = (name for name in os.listdir(directory) if name.endswith('.wav'))
    for name in wav_names:
        with wave.open(os.path.join(directory, name), 'r') as wav_file:
            total_duration += wav_file.getnframes() / float(wav_file.getframerate())
    return total_duration

# Sum the duration of every .wav file in the recordings directory and print it.
directory = '/kaggle/input/upd-speech/mono_voice'
total_duration = get_wav_duration(directory)
print('Total duration of WAV files:', total_duration, 'seconds')

In [None]:
def format_time(seconds):
    """Format a number of seconds as 'HH:MM:SS'.

    Each field is truncated to an integer and zero-padded to at least two
    digits; the hours field is not wrapped at 24.
    """
    hours, within_hour = divmod(seconds, 3600)
    minutes = within_hour // 60
    secs = seconds % 60
    return f'{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}'
# Render the total duration of the recordings as HH:MM:SS.
formatted_time = format_time(total_duration)
print(formatted_time)  # for reference, format_time(3661) gives '01:01:01'