#### YAPILMIŞTI DÜZENLENECEK (VAD ve FILTERDAN SONRA)

In [2]:
# Gerekli kütüphaneler
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.functional import one_hot
import matplotlib.pyplot as plt
import os
import warnings        
warnings.filterwarnings("ignore")

In [3]:
# Veri işleme fonksiyonu
def read_and_sample(csv_path, fraction):
    data = pd.read_csv(csv_path)
    sample_data = data.sample(frac=fraction, random_state=42)
    return sample_data

In [4]:
# Verileri oku
sample_fraction = 0.001

train_data = read_and_sample("Datasets/STT_Datasets/train.csv", sample_fraction)
test_data = read_and_sample("Datasets/STT_Datasets/test.csv", sample_fraction)

In [5]:
# Verileri Kontrol et
print("Train Data Sample:")
print(train_data.head())
print("Test Data Sample:")
print(test_data.head())

Train Data Sample:
                                                    path  language
10125  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1
20971  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1
4745   Datasets\STT_Datasets\tr\clips\common_voice_tr...         0
18720  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1
13701  Datasets\STT_Datasets\tr\clips\common_voice_tr...         0
Test Data Sample:
                                                   path  language
1652  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1
5303  Datasets\STT_Datasets\tr\clips\common_voice_tr...         0
2987  Datasets\STT_Datasets\tr\clips\common_voice_tr...         0
1545  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1
3767  Datasets\STT_Datasets\zh-CN\clips\common_voice...         1


In [6]:
def preprocess_audio(audio_path):
    audio, sr = librosa.load(audio_path)
    audio = librosa.util.normalize(audio)
    audio = librosa.feature.mfcc(audio)
    tensor = torch.from_numpy(audio).T
    tensor = tensor.unsqueeze(0)
    tensor = tensor.float()
    return tensor

In [7]:
# train verileri yükle
train_x_list, train_y_list = [], []
for data in train_data.itertuples():
    train_x_list.append(preprocess_audio(data.path))
    tensor_y = torch.tensor(data.language)
    train_y_list.append(one_hot(tensor_y, num_classes=2).unsqueeze(0).float())
    

In [8]:
# Parametreleri ayarla
epochs = 100
input_size = train_x_list[0].shape[2]
hidden_size = 256
layer_sizes = 2
output_size = 2
learning_rate=1e-4

In [9]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size,layer_sizes, output_size):
        super(Net, self).__init__()
        
        self.hidden_size = hidden_size
        self.layer_sizes = layer_sizes
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=layer_sizes, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):
        h0 = torch.zeros(self.layer_sizes, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.layer_sizes, x.size(0), self.hidden_size)
        
        x = self.dropout(x)
        output, (hn, cn) = self.lstm(x, (h0, c0))
        output = self.fc(output[:,-1,:])
        return output

In [10]:
model = Net(input_size, hidden_size, layer_sizes, output_size)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [12]:
for epoch in range(epochs):
    for audio, label in zip(train_x_list, train_y_list):
        optimizer.zero_grad()
        output = model(audio)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, loss.item()))

Epoch [1/100], Loss: 0.7092
Epoch [2/100], Loss: 0.7771


KeyboardInterrupt: 

In [None]:
audio = preprocess_audio("Datasets/STT_Datasets/zh-CN/clips/common_voice_zh-CN_18531543.mp3")

In [None]:
pred = model(audio)
torch.argmax(pred, dim=1),pred