In [1]:
import os
import torch
import librosa
import numpy as np
from torch.nn.functional import one_hot
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class ASRDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        sample = {'features': self.features[idx], 'labels': self.labels[idx]}
        return sample

features = []
labels = []
files = os.listdir("./data/recordings/")

# 读取文件 features: 经过mfcc处理的音频特征, labels: 每个音频的特征标签
for file in files:
    wave, sr = librosa.load("./data/recordings/"+file)
    mfcced_wave = librosa.feature.mfcc(y=wave, sr=sr)
    mfcced_wave = np.pad(mfcced_wave, ((0,0),(0,100-mfcced_wave.shape[1])),
                        mode='constant', constant_values=0)
    features.append(mfcced_wave)
    labels.append(int(file[0]))

# 数据转移到GPU
features = torch.tensor(features).cuda()
labels = torch.tensor(labels).cuda()

# 对特征进行归一化, 标签转换独热编码
mean_features = torch.mean(features, dim=2)
std_features = torch.std(features, dim=2)
features = features.sub_(mean_features[:,:,None]).div_(std_features[:,:,None])
labels = one_hot(labels, 10)

# 对 recordings 数据的特征和标签打包并划分训练集,测试集
x_train, x_test, y_train, y_test = train_test_split(features, labels, train_size=0.7)
train_dataset = ASRDataset(x_train, y_train)
test_dataset = ASRDataset(x_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

  features = torch.tensor(features).cuda()


In [2]:
import torch.nn as nn
import torch.nn.functional as func

class ASRCNN(nn.Module):
    def __init__(self,config):
        super(ASRCNN, self).__init__()

        # 卷积层定义
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=config.data_dims,
                      out_channels=config.num_filters,
                      kernel_size=filter_size)
                      for filter_size in config.filter_sizes
        ])

        # 池化层定义
        self.maxpool = nn.AdaptiveMaxPool1d(1)

        # 全连接层定义
        self.num_filters_total = config.num_filters * len(config.filter_sizes)
        self.fc1 = nn.Linear(self.num_filters_total, config.hidden_dim)
        self.fc2 = nn.Linear(config.hidden_dim, config.num_classes)

        # 神经元保留概率
        self.keep_prob = config.keep_prob

        # 初始化参数
        self.init_weights()
    
    def init_weights(self):
        for conv_layer in self.conv_layers:
            nn.init.xavier_uniform_(conv_layer.weight)

        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        
    def forward(self, x):
        x = x.permute(0, 2, 1)
        # 卷积操作
        conved_x_list = [func.relu(conv(x)) for conv in self.conv_layers]
        # 池化操作
        pooled_x_list = [self.maxpool(conved_x) for conved_x in conved_x_list]

        conned_x = torch.cat(pooled_x_list, dim=1)
        conned_x = conned_x.view(-1,self.num_filters_total)

        fc_x = func.relu(self.fc1(conned_x))
        fc_x = func.dropout(fc_x, self.keep_prob)

        outs = func.softmax(self.fc2(fc_x), dim=1)

        return outs
        

In [3]:
class CNNConfig():
    data_dims = 100
    filter_sizes = [2, 3, 4, 5]
    num_filters = 64
    num_classes = 10
    hidden_dim = 256
    keep_prob = 0.7

config = CNNConfig()
model = ASRCNN(config).cuda()

In [4]:
import torch.optim as optim

loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9)

# 模型训练
epoch_num = 200

max_acc = 0.0
for epoch in range(epoch_num):
    loss_sum = 0.0
    total_batch = 0.0
    torch.cuda.empty_cache()
    for i, data in enumerate(train_loader):
        features, labels = data["features"].cuda(), data["labels"].cuda()
        features, labels = features.float(), labels.float()
        optimizer.zero_grad()
        outputs = model.forward(features)
        # 默认为单批次损失的均值
        loss = loss_func(outputs, labels)
        loss_sum += loss.item()
        total_batch += 1
        loss.backward()
        optimizer.step()

    loss_mean = loss_sum / total_batch

    correct_sum = 0.0
    total = 0.0
    torch.cuda.empty_cache()
    for data in test_loader:
        features, labels = data["features"].cuda(), data["labels"].cuda()
        features, labels = features.float(), labels.float()
        with torch.no_grad():
            outputs = model.forward(features)
            _, predicts = torch.max(outputs, 1)
            _, labels = torch.max(labels, 1)
            correct_sum += (predicts == labels).sum().item()
        total += labels.size(0)
    
    acc_mean = correct_sum / total * 100
    
    if (epoch+1) % 10 == 0:
        print("第{}次迭代-模型损失:{:.4f}; 准确率:{:.4f}%".format(epoch+1, loss_mean, acc_mean))

    if acc_mean > 90.0 and acc_mean > max_acc:
        max_acc = acc_mean
        print("保存模型")
        torch.save(model.state_dict(),'./data/models/ASRCNN_{}.rui'.format(acc_mean))



第10次迭代-模型损失:1.9487; 准确率:51.4555%
第20次迭代-模型损失:1.7561; 准确率:67.7226%
第30次迭代-模型损失:1.6926; 准确率:73.5445%
第40次迭代-模型损失:1.6029; 准确率:81.4212%
第50次迭代-模型损失:1.5567; 准确率:83.2192%
第60次迭代-模型损失:1.5477; 准确率:85.3596%
第70次迭代-模型损失:1.5310; 准确率:85.9589%
第80次迭代-模型损失:1.5274; 准确率:88.3562%
第90次迭代-模型损失:1.5142; 准确率:87.1575%
第100次迭代-模型损失:1.5054; 准确率:88.9555%
第110次迭代-模型损失:1.5003; 准确率:88.0993%
保存模型
第120次迭代-模型损失:1.5011; 准确率:88.8699%
第130次迭代-模型损失:1.4901; 准确率:89.7260%
第140次迭代-模型损失:1.5297; 准确率:88.6986%
保存模型
第150次迭代-模型损失:1.4893; 准确率:89.8973%
第160次迭代-模型损失:1.4848; 准确率:90.2397%
保存模型
第170次迭代-模型损失:1.4847; 准确率:90.5822%
保存模型
第180次迭代-模型损失:1.4836; 准确率:90.2397%
保存模型
第190次迭代-模型损失:1.4869; 准确率:89.9829%
第200次迭代-模型损失:1.4876; 准确率:88.3562%
