In [1]:
import numpy as np
from datasets import load_dataset
import warnings

from torch import optim

warnings.filterwarnings("ignore")

In [2]:
datasets_tr = load_dataset("covost2", "tr_en", data_dir="Datasets/STT_Datasets/tr")

Using custom data configuration tr_en-7cf12dd05348fb65
Reusing dataset covost2 (C:\Users\Huawei\.cache\huggingface\datasets\covost2\tr_en-7cf12dd05348fb65\1.0.0\bba950aae1ffa5a14b876b7e09c17b44de2c3cf60e7bd5d459640beffc78e35b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
data = datasets_tr["train"][:1]

In [4]:
data

{'client_id': ['1c4f1371075f6e9870c2523bc68067c9fbf0a8c3862ac1480fe4c2ebfefa3e5f1f94d6c60e0e02b07eda2effdb6515b2a20053476b39bafa8746e25ac784ef67'],
 'file': ['D:/ROBOT-V0/chatbot/Speech2Text/Datasets/STT_Datasets/tr/clips/common_voice_tr_18756242.mp3'],
 'audio': [{'path': 'D:/ROBOT-V0/chatbot/Speech2Text/Datasets/STT_Datasets/tr/clips/common_voice_tr_18756242.mp3',
   'array': array([ 3.9850403e-14, -1.4853513e-16,  1.2445653e-13, ...,
          -3.5560884e-08, -8.3746488e-07,  5.5270289e-07], dtype=float32),
   'sampling_rate': 16000}],
 'sentence': ['Bunda başarı sağlandı gibi de görünüyor.'],
 'translation': ['It appears that this has been succeeded.'],
 'id': ['common_voice_tr_18756242']}

In [5]:
import librosa
import numpy as np
import torch

def audio_transformer(audio_path):
    audio, sr = librosa.load(audio_path, sr=16000)
    audio = librosa.util.normalize(audio)
    audio = librosa.feature.melspectrogram(audio)
    audio = librosa.power_to_db(audio, ref=np.max)
    tensor = torch.from_numpy(audio).T
    tensor = tensor.unsqueeze(0)
    return tensor

In [6]:
alphabet = "abcdefghijklmnopqrstuvwxyz, "
SOS_token = 0
EOS_token = 1

def label_processing(label):
    label = label.lower()
    tensor = torch.zeros(len(label))
    for i,c in enumerate(label):
        try:
            index = alphabet.index(c)+2
        except:
            if c==".":
                index = EOS_token
        finally:
            tensor[i] = torch.tensor([index])
    return tensor.unsqueeze(0).long()

In [7]:
train_x_list,train_y_list=[],[]
for path,label in zip(data["file"],data["translation"]):
    train_x_list.append(audio_transformer(path))
    train_y_list.append(label_processing(label))

In [8]:
import torch.nn as nn
import torch.nn.functional as F

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size,layer_size, p):
        super(Encoder, self).__init__()
        
        self.drop = nn.Dropout(p)
        self.rnn = nn.LSTM(input_size, hidden_size,layer_size, batch_first=True, dropout=p)
    def forward(self, x):
        output = self.drop(x)
        output, hidden = self.rnn(output)
        return output, hidden

In [10]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.EO = nn.Linear(hidden_size, hidden_size)
        self.Hi = nn.Linear(hidden_size*2, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
    def forward(self, keys,query):
        hidden_state = query[0].permute(1,0,2)
        cell_state = query[1].permute(1,0,2)
        
        hidden_state = torch.cat((hidden_state, cell_state), dim=-1)
        scores = self.Va(torch.tanh(self.Hi(hidden_state) + self.EO(keys)))
        
        weights = F.softmax(scores, dim=1)
        context = torch.sum(torch.mul(weights,keys), dim=1).unsqueeze(1)
        
        return context

In [11]:
class Decoder(nn.Module):
    def __init__(self,hidden_size,output_size,layer_size,p):
        super(Decoder, self).__init__()
        
        self.drop = nn.Dropout(p)
        self.embedding = nn.Embedding(output_size,hidden_size)
        self.rnn = nn.LSTM(hidden_size*2, hidden_size,layer_size, batch_first=True, dropout=p)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.attention = Attention(hidden_size)
        
    def forward(self,decoder_input, decoder_hidden, encoder_output):
        output = self.drop(self.embedding(decoder_input))
        
        context = self.attention(encoder_output, decoder_hidden)
        input_rnn = torch.cat((output, context), dim=-1)
        
        output, hidden = self.rnn(input_rnn,decoder_hidden)
        output = self.out(output)
        
        return output, hidden

In [12]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size,layer_size, output_size, p, max_length):
        super(Model, self).__init__()
        
        self.max_length = max_length
        self.drop = nn.Dropout(p)
        self.encoder = Encoder(input_size, hidden_size,layer_size,p)
        self.decoder = Decoder(hidden_size,output_size,layer_size,p)
        
    def forward(self,x,target=None):
        encoder_output,encoder_hidden = self.encoder(x)
        
        decoder_outputs = []
        
        decoder_input = torch.empty(x.size(0),1,dtype=torch.long).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        
        if target is not None:
            max_length = target.size(1)
        else:
            max_length = self.max_length
        
        for i in range(max_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_output)
            decoder_outputs.append(decoder_output)
            if target is not None:
                decoder_input = target[:, i].unsqueeze(1)
            else:
                topv,topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(1).detach()
            
                if decoder_input == EOS_token:
                    break

        decoder_outputs = torch.cat(decoder_outputs,1)
        decoder_outputs = F.log_softmax(decoder_outputs,dim=1)
        return decoder_outputs

In [26]:
model = Model(128,256,1,len(alphabet)+2,0.2,100)

In [27]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=4e-5,weight_decay=1e-5)
epochs = 1000

In [28]:
for e in range(epochs):
    for i,(x,y) in enumerate(zip(train_x_list,train_y_list)):
        optimizer.zero_grad()
        output = model(x,y).squeeze(0)
        loss = criterion(output,y.squeeze(0))
        loss.backward()
        optimizer.step()
        prediction = ""
        for i in range(len(output)):
            value = torch.argmax(output[i])
            prediction += alphabet[value-2]
        print(loss.item(),prediction)

3.6957714557647705 ykwkc,,cixxkhrtefu ozldzzlt e gyaahpp,p,
3.6986641883850098 kkbkc,cbbvvguijmfueezzf igtp  llgghpp,p,
3.694556713104248 kkblb,c ixvluvmxgu ehffibl   llltghppnpn
3.689216136932373 kkukc,, ,vizudthmhfezooojzl e lraghpp,pn
3.6745636463165283 kbbkkl,ecmbluhjhfhiurooixlt   llgdhpp, n
3.6721434593200684 bkblb,,ptvzzuiimfaiurdoqrzpep llhhhpp,p,
3.6649317741394043 kkkmb,, gvvzuhihth ujzfirdt plllhghpp,p,
3.663952589035034 kkkkc,,bbvaluvimahaeazozxltee lrgzhppnn,
3.6421966552734375 kbbkc,febxblnffhxuiexzfmyltpp llgzh,e, ,
3.648900270462036 bbk,bi,c,vbztviehh huzffzzzne llashpdn ,
3.628690004348755 bbbkc,,ppvalu ruauferziillleedfldzhdd, ,
3.6158854961395264 kkkyt,ce,sxzuii mhiorwfrrztep lldhhpp,d,
3.6188838481903076 kkkkt,ce,milu , xuiezzfxxxye  lyaggppe  
3.607414960861206 kkkkcicccsbouvmhhhaorzoixzzeelllnzhpp,d,
3.6047871112823486 kkkkb,lebmvzu txshiuufffiytnd lrgzhdp,  
3.594050884246826 kk,kc,,pbx lu ihfhvehloixzsee llazhddddn
3.5908055305480957 kkkkpppeivbzuejhgha rhvixzte

KeyboardInterrupt: 

In [22]:
len(data["translation"][0]) , data["translation"][0]

(40, 'It appears that this has been succeeded.')

In [23]:
train_y_list[0].size(1)

40

In [24]:
pred = model(train_x_list[0])
pred.shape

torch.Size([1, 40, 30])

In [25]:
prediction

'it appears that this has been succeeded '