In [76]:
import os
import torch
import librosa
import numpy as np
from torch.nn.functional import one_hot


# Directory holding the recordings; the first character of every file name is
# read as the digit label (0-9) of that clip.
RECORDINGS_DIR = "./data/recordings/"
# Every MFCC matrix is truncated / zero-padded to this many frames so that all
# clips can be stacked into a single tensor.
MAX_FRAMES = 100
NUM_CLASSES = 10
# Lower bound on the standard deviation so constant rows do not divide by zero.
STD_EPS = 1e-8

features = []
labels = []
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# sample order. Entries that do not start with a digit (hidden files, checkpoint
# folders, ...) cannot be labelled and are skipped.
files = sorted(name for name in os.listdir(RECORDINGS_DIR)
               if name[:1] in tuple("0123456789"))
if not files:
    raise FileNotFoundError(
        "no digit-prefixed recordings found in " + RECORDINGS_DIR)

# Load every file. features: MFCC matrix of each clip, labels: digit of each clip.
for file in files:
    wave, sr = librosa.load(os.path.join(RECORDINGS_DIR, file))
    mfcced_wave = librosa.feature.mfcc(y=wave, sr=sr)
    # Truncate first: np.pad raises on a negative pad width, which is what a
    # clip longer than MAX_FRAMES would otherwise produce.
    mfcced_wave = mfcced_wave[:, :MAX_FRAMES]
    mfcced_wave = np.pad(mfcced_wave,
                         ((0, 0), (0, MAX_FRAMES - mfcced_wave.shape[1])),
                         mode='constant', constant_values=0)
    features.append(mfcced_wave)
    labels.append(int(file[0]))

# Move the data to the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Stack into one ndarray first; building a tensor from a Python list of
# ndarrays is far slower.
features = torch.tensor(np.stack(features), dtype=torch.float32).to(device)
labels = torch.tensor(labels).to(device)

# Standardise every coefficient row of every clip over its (zero-padded) frame
# axis, then one-hot encode the labels.
mean_features = torch.mean(features, dim=2, keepdim=True)
std_features = torch.std(features, dim=2, keepdim=True).clamp_min(STD_EPS)
features = (features - mean_features) / std_features
labels = one_hot(labels, NUM_CLASSES)

print(features.shape)
print(labels.shape)


torch.Size([3891, 20, 100])
torch.Size([3891, 10])


In [77]:
class CNNConfig:
    """Hyper-parameters consumed by the ASRCNN model."""

    # --- convolution stage ---
    # Passed as `in_channels` to every Conv1d layer.
    data_dims = 100
    # One Conv1d branch is created per kernel size listed here.
    filter_sizes = [2, 3, 4, 5]
    # Output channels of each Conv1d branch.
    num_filters = 64

    # --- fully connected stage ---
    # Width of the hidden fully connected layer.
    hidden_dim = 256
    # Dropout setting for the hidden layer, expressed as a keep probability.
    keep_prob = 0.7
    # Size of the output layer (one unit per class).
    num_classes = 10

In [81]:
import torch.nn as nn
import torch.nn.functional as func

class ASRCNN(nn.Module):
    """Multi-branch 1-D CNN classifier.

    Several Conv1d branches with different kernel sizes run in parallel over
    the same input. Each branch is ReLU-activated and max-pooled down to one
    value per filter; the pooled vectors are concatenated and passed through
    two fully connected layers that end in a softmax.

    Args:
        config: object exposing ``data_dims``, ``filter_sizes``,
            ``num_filters``, ``num_classes``, ``hidden_dim`` and ``keep_prob``.
    """

    def __init__(self, config):
        super().__init__()

        # Convolution layers: one branch per kernel size.
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(in_channels=config.data_dims,
                      out_channels=config.num_filters,
                      kernel_size=filter_size)
            for filter_size in config.filter_sizes
        ])

        # Pooling layer: reduces every filter's response to a single maximum.
        self.maxpool = nn.AdaptiveMaxPool1d(1)

        # Fully connected layers.
        self.num_filters_total = config.num_filters * len(config.filter_sizes)
        self.fc1 = nn.Linear(self.num_filters_total, config.hidden_dim)
        self.fc2 = nn.Linear(config.hidden_dim, config.num_classes)

        # Probability of KEEPING a hidden unit during dropout.
        self.keep_prob = config.keep_prob

        # Parameter initialisation.
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform initialise the conv and linear weight matrices."""
        for conv_layer in self.conv_layers:
            nn.init.xavier_uniform_(conv_layer.weight)

        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x):
        """Compute class probabilities for a batch.

        Args:
            x: 3-D tensor whose last axis has size ``config.data_dims``; that
                axis becomes the Conv1d channel axis after the permute below.

        Returns:
            Tensor of shape (batch, num_classes) whose rows sum to 1.
        """
        # NOTE(review): with the notebook's (N, 20, 100) features this makes
        # the 100 frames the channels and slides the kernels along the 20
        # coefficients -- confirm that this orientation is intended.
        x = x.permute(0, 2, 1)
        # Convolution + ReLU, then global max pooling, for every branch.
        pooled_x_list = [self.maxpool(func.relu(conv(x)))
                         for conv in self.conv_layers]

        # (batch, num_filters_total, 1) -> (batch, num_filters_total)
        conned_x = torch.cat(pooled_x_list, dim=1).flatten(start_dim=1)

        fc_x = func.relu(self.fc1(conned_x))
        # functional.dropout takes the DROP probability, so convert from the
        # keep probability, and pass the module's training flag so dropout is
        # switched off by model.eval().
        fc_x = func.dropout(fc_x, p=1.0 - self.keep_prob,
                            training=self.training)

        # NOTE(review): the output is already softmax-normalised; a loss that
        # expects raw logits (e.g. nn.CrossEntropyLoss) should not be fed this
        # directly -- verify against the training code.
        return func.softmax(self.fc2(fc_x), dim=1)
        

In [82]:
# Smoke test: build the model and push the whole feature tensor through it once
# to confirm the output shape.
config = CNNConfig()
# Put the model on the device the features already live on, so the two can
# never end up on different devices.
model = ASRCNN(config).to(features.device)
# Call the module itself rather than .forward(): nn.Module.__call__ is the
# supported entry point and also runs any registered hooks.
out = model(features)
print(out.shape)

torch.Size([3891, 10])
