In [1]:
import os
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np 
import matplotlib
from transformers import AutoModelForSeq2SeqLM, T5TokenizerFast
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
def avg_wer(wer_scores, combined_ref_len):
    """Return the corpus-level error rate.

    Args:
        wer_scores: iterable of per-utterance error counts.
        combined_ref_len: total reference length the counts are normalised by.

    Returns:
        float: sum of the scores divided by ``combined_ref_len``.
    """
    total_errors = float(sum(wer_scores))
    total_length = float(combined_ref_len)
    return total_errors / total_length


def _levenshtein_distance(ref, hyp):
    m = len(ref)
    n = len(hyp)

    # special case
    if ref == hyp:
        return 0
    if m == 0:
        return n
    if n == 0:
        return m

    if m < n:
        ref, hyp = hyp, ref
        m, n = n, m

    distance = np.zeros((2, n + 1), dtype=np.int32)

    for j in range(0,n + 1):
        distance[0][j] = j

    for i in range(1, m + 1):
        prev_row_idx = (i - 1) % 2
        cur_row_idx = i % 2
        distance[cur_row_idx][0] = i
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
            else:
                s_num = distance[prev_row_idx][j - 1] + 1
                i_num = distance[cur_row_idx][j - 1] + 1
                d_num = distance[prev_row_idx][j] + 1
                distance[cur_row_idx][j] = min(s_num, i_num, d_num)

    return distance[m % 2][n]


def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Count word-level edits between a reference and a hypothesis string.

    Args:
        reference: ground-truth text.
        hypothesis: predicted text.
        ignore_case: when equal to ``True`` both texts are lowercased first.
        delimiter: separator used to split the texts into words.

    Returns:
        tuple: ``(edit_distance as float, number of reference words)``.
    """
    if ignore_case == True:
        reference, hypothesis = reference.lower(), hypothesis.lower()

    reference_tokens = reference.split(delimiter)
    hypothesis_tokens = hypothesis.split(delimiter)

    distance = _levenshtein_distance(reference_tokens, hypothesis_tokens)
    return float(distance), len(reference_tokens)


def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    """Count character-level edits between a reference and a hypothesis string.

    Runs of spaces are collapsed to a single space, or removed entirely when
    ``remove_space`` equals ``True``, before the comparison.

    Returns:
        tuple: ``(edit_distance as float, length of the normalised reference)``.
    """
    if ignore_case == True:
        reference, hypothesis = reference.lower(), hypothesis.lower()

    glue = '' if remove_space == True else ' '

    # Splitting on ' ' and dropping empty pieces removes repeated/edge spaces.
    reference = glue.join(piece for piece in reference.split(' ') if piece)
    hypothesis = glue.join(piece for piece in hypothesis.split(' ') if piece)

    distance = _levenshtein_distance(reference, hypothesis)
    return float(distance), len(reference)


def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Word error rate: word-level edit distance divided by the reference word count.

    Raises:
        ValueError: if the reference contains no words.
    """
    errors, reference_length = word_errors(reference, hypothesis, ignore_case,
                                           delimiter)
    if reference_length == 0:
        raise ValueError("Reference's word number should be greater than 0.")
    return float(errors) / reference_length


def cer(reference, hypothesis, ignore_case=False, remove_space=False):
    """Character error rate: character-level edit distance divided by reference length.

    Raises:
        ValueError: if the normalised reference is empty.
    """
    errors, reference_length = char_errors(reference, hypothesis, ignore_case,
                                           remove_space)
    if reference_length == 0:
        raise ValueError("Length of reference should be greater than 0.")
    return float(errors) / reference_length

class TextTransform:
    """Maps lowercase Russian text to integer labels and back."""

    def __init__(self):
        # The order is deliberately NOT alphabetical ("ч" precedes "ц", "х" is
        # second to last, space is last): each character's position is its label.
        # NOTE(review): presumably saved checkpoints rely on these indices -- do not reorder.
        alphabet = "абвгдеёжзийклмнопрстуфчцшщъыьэюях "
        self.char_map = {symbol: position for position, symbol in enumerate(alphabet)}
        self.index_map = {position: symbol for symbol, position in self.char_map.items()}

    def text_to_int(self, text):
        """Convert text to a list of integer labels (KeyError on unknown characters)."""
        return [self.char_map[symbol] for symbol in text]

    def int_to_text(self, labels):
        """Convert a sequence of integer labels back to a string."""
        return ''.join(self.index_map[label] for label in labels)


# Feature extraction used by data_processing() for training batches:
# 20 MFCC coefficients, wrapped in nn.Sequential.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MFCC(n_mfcc=20)
)


# Feature extraction for validation batches: the same MFCC settings, unwrapped.
valid_audio_transforms = torchaudio.transforms.MFCC(n_mfcc=20)

# Shared character <-> label mapping used by the collate function and the decoders.
text_transform = TextTransform()

def data_processing(data, data_type="train"):
    """Collate (waveform, utterance) pairs into padded batch tensors for CTC training.

    Args:
        data: iterable of ``(waveform, utterance)`` pairs.
        data_type: ``'train'`` or ``'valid'``; selects the audio transform.

    Returns:
        tuple: ``(spectrograms, labels, input_lengths, label_lengths)`` where the
        first two are zero-padded tensors and the last two are Python lists.

    Raises:
        Exception: if ``data_type`` is neither ``'train'`` nor ``'valid'``
        (raised while processing the first item).
    """
    features_per_item, labels_per_item = [], []
    input_lengths, label_lengths = [], []

    for waveform, utterance in data:
        if data_type == 'train':
            transform = train_audio_transforms
        elif data_type == 'valid':
            transform = valid_audio_transforms
        else:
            raise Exception('data_type should be train or valid')

        # Put the frame axis first so pad_sequence pads along it.
        features = transform(waveform).squeeze(0).transpose(0, 1)
        encoded = torch.Tensor(text_transform.text_to_int(utterance))

        features_per_item.append(features)
        labels_per_item.append(encoded)
        # NOTE(review): the division by 3 presumably mirrors the stride of the
        # model's first convolution -- confirm if the architecture changes.
        input_lengths.append(features.shape[0] // 3)
        label_lengths.append(len(encoded))

    padded_features = nn.utils.rnn.pad_sequence(features_per_item, batch_first=True)
    # Add a channel axis and move the frame axis back to the end.
    spectrograms1 = padded_features.unsqueeze(1).transpose(2, 3)

    labels = nn.utils.rnn.pad_sequence(labels_per_item, batch_first=True)

    return spectrograms1, labels, input_lengths, label_lengths


def GreedyDecoder(output, labels, label_lengths, blank_label=34, collapse_repeated=True):
    """Greedy (best-path) CTC decoding of a batch, plus the decoded target texts.

    Args:
        output: model scores; the arg-max is taken over dimension 2.
        labels: padded target labels, one row per sample.
        label_lengths: true length of each target row.
        blank_label: index of the CTC blank symbol.
        collapse_repeated: drop a label equal to the one in the previous frame.

    Returns:
        tuple: ``(decoded prediction strings, decoded target strings)``.
    """
    best_paths = torch.argmax(output, dim=2)
    decodes, targets = [], []

    for sample_idx, frame_labels in enumerate(best_paths):
        true_length = label_lengths[sample_idx]
        targets.append(text_transform.int_to_text(labels[sample_idx][:true_length].tolist()))

        kept = []
        for position, label in enumerate(frame_labels):
            if label == blank_label:
                continue
            repeats_previous = position != 0 and label == frame_labels[position - 1]
            if collapse_repeated and repeats_previous:
                continue
            kept.append(label.item())
        decodes.append(text_transform.int_to_text(kept))

    return decodes, targets

  "At least one mel filterbank has all zero values. "


In [3]:
class BidirectionalGRU(nn.Module):
    """LayerNorm -> GELU -> single-layer bidirectional GRU -> dropout.

    Args:
        rnn_dim: size of the input features (also the LayerNorm width).
        hidden_size: GRU hidden size per direction; the output has
            ``2 * hidden_size`` features.
        dropout: dropout probability applied to the GRU output.
        batch_first: forwarded to ``nn.GRU``.
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super().__init__()

        # Attribute names and creation order are kept stable (state_dict keys, init RNG).
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        activated = F.gelu(self.layer_norm(x))
        recurrent_out, _hidden = self.BiGRU(activated)
        return self.dropout(recurrent_out)

In [4]:
# Changed the loading step: the id of each audio file is taken from its file name,
# and the matching text is then looked up in the Excel file via the old_id column.
# The result is each audio clip paired with its transcript (X[k] <-> y[k]).

import pandas as pd
import librosa

file = pd.read_excel('/kaggle/input/2700-audio/OneDrive-2023-12-25/Speeches v1.xlsx')
#y = [sentence for sentence in file['text']]
y = []
dir_name = "/kaggle/input/2700-audio/OneDrive-2023-12-25/Speeches/"
# NOTE(review): files_in_dir is not used below; the loop lists the directory again.
files_in_dir = os.listdir(dir_name)

X = []
# NOTE(review): `i` is not used anywhere in this cell.
i = 1

for e in os.listdir("/kaggle/input/2700-audio/OneDrive-2023-12-25/Speeches/"):
    file_name = e
    # Despite its name, `old_id` here is a row index into the sheet.
    # NOTE(review): only the first 2073 rows are scanned (hard-coded) -- confirm
    # this matches the number of rows in the Excel file.
    for old_id in range(0, 2073):
        # Match "<old_id>." at the start of the file name; the first matching row wins.
        if file_name.startswith(str(file['old_id'][old_id]) + '.'):
            y.extend([''.join(file['text'][old_id])])
            # Load as 16 kHz audio and add a leading axis of size 1.
            sampl = librosa.load(dir_name + file_name, sr=16000)[0]
            sampl = sampl[np.newaxis, :]
            X.append(torch.Tensor(sampl))
            break

In [5]:
import random
# Shuffle audio and text together so the pairs stay aligned; the fixed seed
# makes the resulting order (and therefore the later split) reproducible.
pairs = list(zip(X, y))
random.Random(3016).shuffle(pairs)
# Unzipping rebinds X and y to tuples.
X, y = zip(*pairs)

In [6]:
y[:3]

('Разнообразие кухонных блюд', 'Секс', 'Жаркое лето вызывает жажду')

In [7]:
X[:3]

(tensor([[ 5.5877e-09,  1.6280e-09, -1.3302e-10,  ..., -2.2728e-05,
          -2.2285e-05,  0.0000e+00]]),
 tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0042,  0.0015,  0.0000]]),
 tensor([[-5.1168e-06,  5.6805e-06, -2.0654e-05,  ...,  1.8661e-05,
           9.8214e-06,  1.7692e-05]]))

In [None]:
# Write one sample (index 540) to disk as a WAV file at the 16 kHz rate used when loading.
torchaudio.save('/kaggle/working/audio.wav', X[540], 16000)

In [None]:
waveform, sample_rate = torchaudio.load('/kaggle/working/audio.wav')  # load the audio file
# NOTE(review): confirm that `torchaudio.play` exists in the installed torchaudio
# version; this cell has no execution count, so it may never have been run.
torchaudio.play(waveform, sample_rate)

In [8]:
# Characters the recogniser can emit (same mapping as TextTransform.char_map);
# used below to filter transcripts down to the supported alphabet.
char_map = {"а": 0, "б": 1, "в": 2, "г": 3, "д": 4, "е": 5, "ё": 6, "ж": 7, "з": 8, "и": 9, "й": 10,
            "к": 11, "л": 12, "м": 13, "н": 14, "о": 15, "п": 16, "р": 17, "с": 18, "т": 19, "у": 20,
            "ф": 21, "ч": 22, "ц": 23, "ш": 24, "щ": 25, "ъ": 26, "ы": 27, "ь": 28, "э": 29, "ю": 30,
            "я": 31, "х": 32, " ": 33}

def remove_characters(sentence):
    """Normalise a transcript to the alphabet in ``char_map``.

    Steps: lowercase, spell out the few numeric tokens handled here,
    turn hyphens into spaces, drop every character that is not in ``char_map``,
    and collapse whitespace.

    Args:
        sentence: raw transcript text.

    Returns:
        str: the cleaned, lowercase transcript.
    """
    sentence = sentence.lower()
    # The text is already lowercase at this point, so the pattern must be lowercase
    # too; the previous upper-case 'Р-220' pattern could never match, which reduced
    # "Р-220" to a bare "р". It must also run before hyphens/digits are stripped.
    sentence = sentence.replace('р-220', 'р двести двадцать')
    sentence = sentence.replace('4', 'четыре').replace('6', 'шесть').replace("-", " ")
    sentence = ''.join(filter(lambda x: x in char_map, sentence))
    sentence = " ".join(sentence.split())
    return sentence

# Normalise every transcript; y is rebound to a list of cleaned strings.
y = list(map(remove_characters, y))

In [9]:
# Hold-out split on the shuffled pairs: the first 2200 items are used for
# training, everything after them for evaluation.
X_train = X[:2200]
X_test = X[2200:]
y_train = y[:2200]
y_test = y[2200:]

In [10]:
from torch.utils.data import Dataset

class AudioDataset(Dataset):
    """Pairs each audio item with its transcript by position.

    Args:
        audio_list: indexable collection of audio items.
        text_list: indexable collection of transcripts; its length defines
            the dataset size.
    """

    def __init__(self, audio_list, text_list):
        self.audio_list, self.text_list = audio_list, text_list

    def __len__(self):
        return len(self.text_list)

    def __getitem__(self, index):
        return self.audio_list[index], self.text_list[index]

In [11]:
class SpeechRecognitionModel1(nn.Module):
    """Conv front-end -> per-frame MLP -> four bidirectional GRU blocks -> log-softmax.

    Args:
        num_classes: number of output classes per frame.

    The first convolution uses stride 3 on both axes; the per-frame MLP expects
    896 features after the convolutional stack.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Sub-module names and creation order are kept stable (state_dict keys, init RNG).
        self.conv = nn.Sequential(
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 32, kernel_size=(4, 4), stride=(3, 3), padding=(2, 2)),
            nn.BatchNorm2d(32),
            nn.GELU(),
            nn.Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(128),
            nn.GELU(),
            nn.Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(128),
            nn.GELU(),
        )

        self.fc_1 = nn.Sequential(
            nn.Linear(896, 270), nn.LayerNorm(270), nn.GELU(),
            nn.Linear(270, 270), nn.LayerNorm(270), nn.GELU(),
            nn.Linear(270, 270), nn.LayerNorm(270), nn.GELU(),
        )

        # Each block outputs 2 * 270 features; only the last one applies dropout.
        self.BiGRU_1 = BidirectionalGRU(270, 270, 0, True)
        self.BiGRU_2 = BidirectionalGRU(540, 270, 0, True)
        self.BiGRU_3 = BidirectionalGRU(540, 270, 0, True)
        self.BiGRU_4 = BidirectionalGRU(540, 270, 0.5, True)

        self.fc_2 = nn.Sequential(
            nn.Linear(540, num_classes),
        )
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, x):
        features = self.conv(x)
        # Move the last axis to position 1, then merge the remaining two axes
        # into one feature vector per step.
        features = features.permute(0, 3, 1, 2).flatten(start_dim=2)
        features = self.fc_1(features)
        for recurrent_block in (self.BiGRU_1, self.BiGRU_2, self.BiGRU_3, self.BiGRU_4):
            features = recurrent_block(features)
        return self.softmax(self.fc_2(features))

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Name of the selected spell-correction model on the hub
MODEL_NAME = 'UrukHan/t5-russian-spell'
# Maximum tokenized input length passed to the tokenizer in predict_with_tensor
MAX_INPUT = 256

# Load the tokenizer and the model; the model is moved to `device`
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
corrector = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

In [12]:
class IterMeter:
    """Counts training iterations across epochs."""

    def __init__(self):
        self.val = 0

    def step(self):
        """Advance the counter by one."""
        self.val = self.val + 1

    def get(self):
        """Return the number of steps taken so far."""
        return self.val


def train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter):
    """Run one training epoch.

    Args:
        model: network returning per-frame log-probabilities, batch first.
        device: device the batches are moved to (the model must already be on it).
        train_loader: iterable of ``(spectrograms, labels, input_lengths,
            label_lengths)`` batches; must support ``len()`` and expose ``.dataset``.
        criterion: CTC-style loss called as
            ``criterion(output, labels, input_lengths, label_lengths)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        epoch: epoch number, used only in the progress message.
        iter_meter: object with a ``step()`` method, called once per batch.

    Returns:
        float: mean loss over the batches of this epoch.
    """
    model.train()
    train_loss = 0
    data_len = len(train_loader.dataset)
    num_batches = len(train_loader)
    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()

        output = model(spectrograms)
        # The loss expects the time axis first.
        output = output.transpose(0, 1)

        loss = criterion(output, labels, input_lengths, label_lengths)
        train_loss += loss.item() / num_batches
        loss.backward()

        optimizer.step()
        scheduler.step()
        iter_meter.step()

        # Report every 20th batch and the final batch. The previous check compared
        # the batch index with the number of samples and therefore never fired.
        if batch_idx % 20 == 0 or batch_idx == num_batches - 1:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / num_batches, loss.item()))

    # Train-set CER/WER are intentionally not computed here; see test() for metrics.
    return train_loss
            
    

def test(model, device, test_loader, criterion, epoch, iter_meter):
    """Evaluate the model on ``test_loader`` and print loss, CER and WER.

    Args:
        model: network returning per-frame log-probabilities, batch first.
        device: device the batches are moved to (the model must already be on it).
        test_loader: iterable of ``(spectrograms, labels, input_lengths,
            label_lengths)`` batches; must support ``len()``.
        criterion: CTC-style loss.
        epoch: unused; kept for signature compatibility.
        iter_meter: unused; kept for signature compatibility.

    Returns:
        dict: ``loss``, ``avg_cer``, ``avg_wer``, ``median_cer``, ``median_wer``.
        The CER/WER entries are NaN when the loader yields no samples.
    """
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for spectrograms, labels, input_lengths, label_lengths in test_loader:
            spectrograms, labels = spectrograms.to(device), labels.to(device)

            output = model(spectrograms)
            # The loss expects the time axis first; the decoder expects batch first.
            output = output.transpose(0, 1)

            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
            for target_text, predicted_text in zip(decoded_targets, decoded_preds):
                test_cer.append(cer(target_text, predicted_text))
                test_wer.append(wer(target_text, predicted_text))

    if test_cer:
        avg_cer = sum(test_cer) / len(test_cer)
        avg_wer = sum(test_wer) / len(test_wer)
        median_cer = float(np.median(np.array(test_cer)))
        median_wer = float(np.median(np.array(test_wer)))
    else:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        avg_cer = avg_wer = median_cer = median_wer = float('nan')

    # The previous format string had three placeholders for five values, so the
    # medians were silently dropped, and used '{:4f}' (a width) instead of '{:.4f}'.
    print('Test set:\tAverage loss: {:.4f}, Average CER: {:.4f} Average WER: {:.4f}, '
          'Median CER: {:.4f} Median WER: {:.4f}\n'
          .format(test_loss, avg_cer, avg_wer, median_cer, median_wer))

    return {
        'loss': test_loss,
        'avg_cer': avg_cer,
        'avg_wer': avg_wer,
        'median_cer': median_cer,
        'median_wer': median_wer,
    }
    

def main(learning_rate=5e-4, batch_size=20, epochs=10):
    """Train the recogniser on the global train/test split and save the whole model.

    Args:
        learning_rate: peak learning rate (also the optimizer's initial rate).
        batch_size: batch size for both loaders.
        epochs: number of train + evaluate rounds.

    Reads the globals ``X_train``, ``y_train``, ``X_test`` and ``y_test``; writes
    the trained model to ``/kaggle/working/model_for_correction_test.pt``.
    """
    hparams = dict(learning_rate=learning_rate, batch_size=batch_size, epochs=epochs)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Worker/pinning options only make sense when a GPU is present.
    loader_options = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    train_loader = data.DataLoader(
        dataset=AudioDataset(X_train, y_train),
        batch_size=hparams['batch_size'],
        shuffle=True,
        collate_fn=lambda batch: data_processing(batch, 'train'),
        **loader_options)
    test_loader = data.DataLoader(
        dataset=AudioDataset(X_test, y_test),
        batch_size=hparams['batch_size'],
        shuffle=False,
        collate_fn=lambda batch: data_processing(batch, 'valid'),
        **loader_options)

    # 35 output classes; index 34 is the CTC blank.
    model = SpeechRecognitionModel1(35).to(device)

    print(model)
    print('Num Model Parameters', sum(param.nelement() for param in model.parameters()))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=34).to(device)
    # The schedule is stepped once per batch, hence steps_per_epoch = number of batches.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=hparams['learning_rate'],
        steps_per_epoch=len(train_loader),
        epochs=hparams['epochs'],
        anneal_strategy='linear')

    iter_meter = IterMeter()
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter)
        test(model, device, test_loader, criterion, epoch, iter_meter)

    torch.save(model, '/kaggle/working/model_for_correction_test.pt')

In [13]:
# TODO: attach the error corrector here; the acoustic model is trained without it.
def predict(model, file_name, device):
    """Transcribe one audio file with greedy CTC decoding (no spell correction).

    Args:
        model: trained recogniser; it must already be on ``device``.
        file_name: path of the audio file; loaded at 16 kHz.
        device: device the input features are moved to before inference.

    Returns:
        str: the decoded text.
    """
    model.eval()
    mfcc_transform = torchaudio.transforms.MFCC(n_mfcc=20)

    samples = librosa.load(file_name, sr=16000)[0]
    waveform = torch.Tensor(samples[np.newaxis, :])
    # Add the two leading axes the model's convolutional input expects.
    spectrogram_tensor = mfcc_transform(waveform).squeeze(0).unsqueeze(0).unsqueeze(0)

    print(spectrogram_tensor.size())

    with torch.no_grad():
        # Tensor.to() returns a new tensor; the result was previously discarded,
        # leaving the input on the CPU regardless of `device`.
        spectrogram_tensor = spectrogram_tensor.to(device)
        output = model(spectrogram_tensor)
        print(output.size())

        decodes = []
        for frame_labels in torch.argmax(output, dim=2):
            decode = []
            for j, index in enumerate(frame_labels):
                if index == 34:  # CTC blank
                    continue
                if j != 0 and index == frame_labels[j - 1]:  # collapse repeats
                    continue
                decode.append(index.item())
            decodes.append(text_transform.int_to_text(decode))

    return decodes[0]

In [14]:
# Prediction with the T5 spell corrector attached (the acoustic model is trained without it).
def predict_with_tensor(model, sampl):
    """Transcribe a waveform tensor and post-process the text with the T5 corrector.

    Uses the module-level ``tokenizer``, ``corrector``, ``MAX_INPUT`` and
    ``remove_characters``.

    Args:
        model: trained recogniser.
        sampl: waveform tensor accepted by ``torchaudio.transforms.MFCC``
            (assumed to have a leading axis of size 1 -- TODO confirm).

    Returns:
        str: corrected text, normalised with ``remove_characters``.
    """
    model.eval()
    mfcc_transform = torchaudio.transforms.MFCC(n_mfcc=20)
    spectrogram_tensor = mfcc_transform(sampl).squeeze(0).unsqueeze(0).unsqueeze(0)

    # Run on whatever device the model lives on (CPU when it has no parameters).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        # Tensor.to() is not in-place; the moved tensor must be kept.
        spectrogram_tensor = spectrogram_tensor.to(model_device)
        output = model(spectrogram_tensor)

        decodes = []
        for frame_labels in torch.argmax(output, dim=2):
            decode = []
            for j, index in enumerate(frame_labels):
                if index == 34:  # CTC blank
                    continue
                if j != 0 and index == frame_labels[j - 1]:  # collapse repeats
                    continue
                decode.append(index.item())
            decodes.append(text_transform.int_to_text(decode))

    task_prefix = "Spell correct: "
    input_sequences = [decodes[0]]

    encoded = tokenizer(
      [task_prefix + sequence for sequence in input_sequences],
      padding="longest",
      max_length=MAX_INPUT,
      truncation=True,
      return_tensors="pt",
    )

    # Send the tokens to the corrector's own device: it may have been moved to
    # CUDA when it was loaded, and forcing the inputs to CPU would then fail.
    predicts = corrector.generate(**encoded.to(corrector.device))

    corrected_text = tokenizer.batch_decode(predicts, skip_special_tokens=True)[0]
    return remove_characters(corrected_text)

In [15]:
import hunspell
import os

# Trying hunspell for error correction.
def load_hunspell_russian_dict():
    """Create a HunSpell checker from the Russian ``ru_RU_big`` dictionary files.

    Returns:
        hunspell.HunSpell: checker built from the ``.dic`` and ``.aff`` files
        found in ``/kaggle/input/dop-test-files``.
    """
    dictionary_dir = "/kaggle/input/dop-test-files"
    # HunSpell takes the word list (.dic) first, then the affix rules (.aff).
    return hunspell.HunSpell(
        os.path.join(dictionary_dir, "ru_RU_big.dic"),
        os.path.join(dictionary_dir, "ru_RU_big.aff"),
    )

def correct_mistakes(text, hunspell_instance):
    """Replace each misspelled word with the spell checker's first suggestion.

    Args:
        text: whitespace-separated words.
        hunspell_instance: object providing ``spell(word)`` and ``suggest(word)``.

    Returns:
        str: the words joined by single spaces; correctly spelled words and
        words without suggestions are left unchanged.
    """
    def _fix(word):
        if hunspell_instance.spell(word):
            return word
        suggestions = hunspell_instance.suggest(word)
        # Take the first suggestion; keep the word when there is none.
        return suggestions[0] if suggestions else word

    return " ".join(_fix(word) for word in text.split())

# Module-level spell checker; predict_with_tensor_v2 reads this global.
hunspell_instance = load_hunspell_russian_dict()
# Disabled usage example (the sample sentence contains the misspelling "дила"):
#text = "Привет, как дила?"
#corrected_text = correct_mistakes(text, hunspell_instance)
#print(corrected_text)

def predict_with_tensor_v2(model, sampl):
    """Transcribe a waveform tensor and post-process the text with hunspell.

    Uses the module-level ``hunspell_instance`` and ``correct_mistakes``.

    Args:
        model: trained recogniser.
        sampl: waveform tensor accepted by ``torchaudio.transforms.MFCC``
            (assumed to have a leading axis of size 1 -- TODO confirm).

    Returns:
        str: the greedy CTC transcription after word-level spell correction.
    """
    model.eval()
    mfcc_transform = torchaudio.transforms.MFCC(n_mfcc=20)
    spectrogram_tensor = mfcc_transform(sampl).squeeze(0).unsqueeze(0).unsqueeze(0)

    # Run on whatever device the model lives on (CPU when it has no parameters).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        # Tensor.to() is not in-place; the moved tensor must be kept.
        spectrogram_tensor = spectrogram_tensor.to(model_device)
        output = model(spectrogram_tensor)

        decodes = []
        for frame_labels in torch.argmax(output, dim=2):
            decode = []
            for j, index in enumerate(frame_labels):
                if index == 34:  # CTC blank
                    continue
                if j != 0 and index == frame_labels[j - 1]:  # collapse repeats
                    continue
                decode.append(index.item())
            decodes.append(text_transform.int_to_text(decode))

    return correct_mistakes(decodes[0], hunspell_instance)

In [None]:
%%time 
# Hyperparameters for the full training run; these override main()'s
# defaults (5e-4, 20 and 10 respectively).
learning_rate = 0.002
batch_size = 20
epochs = 160

main(learning_rate, batch_size, epochs)

In [16]:
#use_cuda = torch.cuda.is_available()
#device = torch.device("cpu")
needed_device = torch.device("cpu")
# Loads a whole pickled model object (not a state_dict), mapped onto the CPU.
# NOTE(review): unpickling executes code from the file -- only load trusted
# checkpoints. Presumably the model classes defined in this notebook must be in
# scope for the load to succeed; confirm.
model = torch.load('/kaggle/input/dop-test-files/model_for_making_dataset_v7(3016).pt', map_location=torch.device('cpu'))

#1543 1882 1372

model.to(needed_device)
print(needed_device)
#predict(model, '/kaggle/input/upd-speech/mono_voice/1964.wav', device)

cpu


In [17]:
# One row per held-out sample: the waveform tensor and its cleaned transcript.
d = {'X_test': X_test, 'label': y_test}
df_test = pd.DataFrame(data=d)
df_test.head(5)

Unnamed: 0,X_test,label
0,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",отмечать
1,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",тапки не стоит оставлять на улице
2,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",зверёк
3,"[[tensor(2.0789e-06), tensor(-3.5355e-06), ten...",любовь к природе
4,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",вселенная бесконечна действительно


In [18]:
y_test[:5]

['отмечать',
 'тапки не стоит оставлять на улице',
 'зверёк',
 'любовь к природе',
 'вселенная бесконечна действительно']

In [19]:
def count_test_cer(row, model):
    """Character error rate of the hunspell-corrected prediction for one row.

    Args:
        row: mapping with the waveform under ``'X_test'`` and the reference
            text under ``'label'``.
        model: trained recogniser passed to ``predict_with_tensor_v2``.
    """
    hypothesis = predict_with_tensor_v2(model, row['X_test'])
    reference = row['label']
    return cer(reference, hypothesis)

In [20]:
def count_test_wer(row, model):
    """Word error rate of the hunspell-corrected prediction for one row.

    Args:
        row: mapping with the waveform under ``'X_test'`` and the reference
            text under ``'label'``.
        model: trained recogniser passed to ``predict_with_tensor_v2``.
    """
    hypothesis = predict_with_tensor_v2(model, row['X_test'])
    reference = row['label']
    return wer(reference, hypothesis)

In [21]:
def write_preds(row, model):
    """Return the hunspell-corrected prediction for the waveform in ``row['X_test']``."""
    waveform = row['X_test']
    return predict_with_tensor_v2(model, waveform)

In [22]:
# Adds a per-row CER column; count_test_cer runs the model once for every row.
df_test['CER'] = df_test.apply(count_test_cer, axis=1, model = model)

  "At least one mel filterbank has all zero values. "


In [23]:
# Adds a per-row WER column; count_test_wer runs the model once for every row.
df_test['WER'] = df_test.apply(count_test_wer, axis=1, model = model)

In [24]:
# Adds the predicted text as a column; write_preds runs the model once for every row.
df_test['preds'] = df_test.apply(write_preds, axis=1, model = model)

In [25]:
df_test.loc[df_test['CER'] > 0]

Unnamed: 0,X_test,label,CER,WER,preds
1,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",тапки не стоит оставлять на улице,0.151515,0.500000,тапке не соти оставлять на улицы
2,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",зверёк,0.166667,1.000000,зверок
3,"[[tensor(2.0789e-06), tensor(-3.5355e-06), ten...",любовь к природе,0.312500,0.666667,любовь периоды
4,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",вселенная бесконечна действительно,0.029412,0.333333,вселенная бесконечно действительно
5,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",поворот к парку развлечений,0.222222,0.500000,повадка парку развлечений
...,...,...,...,...,...
592,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",там начальство всё время меняется,0.030303,0.200000,там начальство все время меняется
593,"[[tensor(4.6664e-06), tensor(-2.2184e-05), ten...",улыбка залог успеха в любых начинаниях,0.526316,1.000000,лыка зелу узка нахлобучивания
594,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",тонкие линии на бумаге,0.090909,0.750000,тонкие лини на б маге
595,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",по коже и волосам видно всё о человеке,0.157895,0.625000,покое и волосам видно все чело веке


In [26]:
df_test

Unnamed: 0,X_test,label,CER,WER,preds
0,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",незабываемое путешествие на море,0.031250,0.500,не забываемое путешествие на море
1,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",красивый закат над океаном,0.038462,0.250,красивый закат нал океаном
2,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",водоросли,0.666667,1.000,водораспыление
3,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",картина человека с отслеживающими глазами,0.341463,0.800,карт человеко-час оттесывая щеми глазами
4,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",тело то здорово а что делать с остальным,0.125000,0.375,тела то здорово что дела с остальным
...,...,...,...,...,...
592,"[[tensor(-7.3563e-07), tensor(-7.0477e-07), te...",успех,0.600000,1.000,пустее
593,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",дорога перекрыта,0.000000,0.000,дорога перекрыта
594,"[[tensor(-8.3037e-10), tensor(-9.6717e-10), te...",на улице бессмысленно все,0.040000,0.250,на улице бессмысленно вся
595,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...",путь наш лежит за горизонт,0.038462,0.200,пут наш лежит за горизонт


In [None]:
# THE SMALL DICTIONARY WAS USED FOR THE RESULTS BELOW

In [None]:
# Using model(3024_seed_data).pt with hunspell. Without hunspell: average CER = 0.166791, average WER = 0.6451

In [None]:
df_test['CER'].mean()

In [None]:
df_test['WER'].mean()

In [None]:
#using model_for_making_dataset_v4(1242).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

In [None]:
#using model_for_making_dataset_v5(2204).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

In [None]:
#using model_for_making_dataset_v6(20024).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

In [None]:
#using model_for_making_dataset_v7(3016).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

In [None]:
# FROM HERE ON THE LARGER DICTIONARY IS USED

In [None]:
#using model_for_making_dataset_v4(1242).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

In [27]:
#using model_for_making_dataset_v5(2204).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

CER:  0.22032562029369046
WER:  0.5630099753215332


In [26]:
#using model_for_making_dataset_v6(20024).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

CER:  0.2183401687348701
WER:  0.5742835565699888


In [26]:
#using model_for_making_dataset_v7(3016).pt with hunspell. WITHOUT HUNSPELL: Average CER ~ 0.16 Average WER ~ 0.64
print('CER: ', df_test['CER'].mean())
print('WER: ', df_test['WER'].mean())

CER:  0.20849510905259327
WER:  0.5569558349709104


In [None]:
y_test

In [None]:
# Save only the parameters (state_dict) rather than the whole pickled model object.
torch.save(model.state_dict(), '/kaggle/working/model.pth')

In [None]:
import wave

def get_wav_duration(directory):
    """Total playing time, in seconds, of the ``.wav`` files directly inside ``directory``.

    Only names ending in lowercase ``.wav`` are counted; sub-directories are
    not descended into.

    Returns:
        Sum of ``frames / frame rate`` over the files (``0`` when there are none).
    """
    def _seconds(path):
        with wave.open(path, 'r') as wav_file:
            frame_count = wav_file.getnframes()
            frame_rate = wav_file.getframerate()
            return frame_count / float(frame_rate)

    wav_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.wav')
    )
    return sum(_seconds(path) for path in wav_paths)

# Sum the duration of every .wav file in the dataset directory and report it in seconds.
directory = '/kaggle/input/upd-speech/mono_voice'
total_duration = get_wav_duration(directory)
print('Total duration of WAV files:', total_duration, 'seconds')

In [None]:
def format_time(seconds):
    """Format a duration in seconds as ``HH:MM:SS`` (fractions are truncated)."""
    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes = remainder // 60
    leftover_seconds = seconds % 60
    parts = (int(whole_hours), int(whole_minutes), int(leftover_seconds))
    return '{:02d}:{:02d}:{:02d}'.format(*parts)
# NOTE(review): `seconds` is assigned here but is not what gets formatted below.
seconds = 3661
formatted_time = format_time(total_duration)
print(formatted_time)  # HH:MM:SS of total_duration ('01:01:01' is what 3661 s would give)