In [1]:
# imports
import numpy as np
from collections import ChainMap
import argparse
import random
import os
import datetime
import re
import hashlib
from enum import Enum
import librosa
import pcen
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch._six import with_metaclass
from torch._C import _ImperativeEngine as ImperativeEngine

In [2]:
def default_config():
    config = {}
    # model set
    config["group_speakers_by_id"] = True
    config["input_file"] = None   # model\EdgeCRNN-2x.pt
    config["n_labels"] = 12
    config["no_cuda"] = False

    # input shape
    config["silence_prob"] = 0.1
    config["noise_prob"] = 0.8
    config["n_dct_filters"] = 40
    config["input_length"] = 16000
    config["n_mels"] = 13  # MFCC-》39， log_mel->13 PCEN
    config["timeshift_ms"] = 100
    config["unknown_prob"] = 0.1
    config["train_pct"] = 80
    config["dev_pct"] = 10
    config["test_pct"] = 10
    config["wanted_words"] = ["zero", "one ", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    config["data_folder"] = "/home/koushik/Documents/code/repos/KWS/data"
    config["audio_preprocess_type"] = "MFCCs"
    config["feature_type"] = "log_mel"  # ["MFCC", "log_mel", PCEN]

    # train parameter
    config["n_epochs"] = 1
    config["add_noise"] = True
    config["type"] = "train"  # [train, eval]
    config["loss"] = "focal"  # ["CE", "focal"]
    config["optimizer"] = "adam"  # ["adam", "sgd"]
    config["model_type"] = "EdgeCRNN"  # ["EdgeCRNN","shuffleNet", "Tpool2",
    #  "rnn", "mobileNet", "mobileNetV3-Small", "mobileNetV3-Large"]
    config["preprocess_data"] = 2  # [1, 2]  2 online, 1 offline
    config["width_mult"] = 1
    config["output_file"] = "model/EdgeCRNN"  #

    return config

In [3]:
class ConfigBuilder(object):
    def __init__(self, *default_configs):
        self.default_config = ChainMap(*default_configs)

    def build_argparse(self):
        parser = argparse.ArgumentParser()
        for key, value in self.default_config.items():
            key = "--{}".format(key)
            if isinstance(value, tuple):
                parser.add_argument(key, default=list(value), nargs=len(value), type=type(value[0]))
            elif isinstance(value, list):
                parser.add_argument(key, default=value, nargs="+", type=type(value[0]))
            elif isinstance(value, bool) and not value:
                parser.add_argument(key, action="store_true")
            else:
                parser.add_argument(key, default=value, type=type(value))
        return parser

    def config_from_argparse(self, parser=None):
        if not parser:
            parser = self.build_argparse()
        args = vars(parser.parse_known_args()[0])
        return args

In [4]:
def frist_conv(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, 1, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)  # nn.Relu()
    )

def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        # nn.PReLU()
        nn.ReLU(inplace=True)
    )

def channel_shuffle(x, groups):
    batchsize, num_channels, height, width = x.data.size()

    channels_per_group = num_channels // groups

    # reshape
    x = x.view(batchsize, groups, channels_per_group, height, width)

    x = torch.transpose(x, 1, 2).contiguous()  # (batchsize, channels_per_group, groups, height, width)

    # flatten
    x = x.view(batchsize, -1, height, width)  # (batchsize, -1, height, width)

    return x

def Base_block(oup_inc, stride):

    banch = nn.Sequential(
        # pw
        nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup_inc),
        nn.ReLU(inplace=True),
        # dw
        nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
        nn.BatchNorm2d(oup_inc),
        # pw-linear
        nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup_inc),
        nn.ReLU(inplace=True),
    )
    return banch

def EdgeCRNN_block(inp, oup_inc, stride):
    left_banch = nn.Sequential(
        # dw
        nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
        nn.BatchNorm2d(inp),
        # pw-linear
        nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup_inc),
        nn.ReLU(inplace=True),
    )
    right_banch = nn.Sequential(
                # pw
                nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
                nn.BatchNorm2d(oup_inc),
                # pw-linear
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )
    return left_banch, right_banch

class EdgeCRNN_Residual(nn.Module):
    def __init__(self, inp, oup, stride, benchmodel):
        super(EdgeCRNN_Residual, self).__init__()
        self.benchmodel = benchmodel
        self.stride = stride
        assert stride in [1, 2]

        oup_inc = oup // 2

        if self.benchmodel == 1:
            # assert inp == oup_inc
            self.banch2 = Base_block(oup_inc, stride)
        else:
            self.banch1, self.banch2 = EdgeCRNN_block(inp, oup_inc, stride)

    @staticmethod
    def _concat(x, out):
        # concatenate along channel axis
        return torch.cat((x, out), 1)

    def forward(self, x):
        if 1 == self.benchmodel:
            x1 = torch.chunk(x, 2, 1)[0]
            x2 = torch.chunk(x, 2, 1)[1]
            out = self._concat(x1, self.banch2(x2))
        elif 2 == self.benchmodel:
            out = self._concat(self.banch1(x), self.banch2(x))

        return channel_shuffle(out, 2)

class EdgeCRNN(nn.Module):
    def __init__(self, n_class=12, input_size=101, width_mult=1.):
        super(EdgeCRNN, self).__init__()

        # assert input_size % 32 == 0

        self.stage_repeats = [2, 3, 2]
        # index 0 is invalid and should never be called.
        # only used for indexing convenience.
        if width_mult == 0.5:
            self.stage_out_channels = [-1, 16, 32, 64, 128, 256]  # *2  *2  16,  32,  64, 128, 256
        elif width_mult == 1.0:
            self.stage_out_channels = [-1, 24, 72, 144, 288, 512]  # *4.9 *2  24, 72, 144, 288, 512
        elif width_mult == 1.5:
            self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]  # *7.3 *2
        elif width_mult == 2.0:
            self.stage_out_channels = [-1, 24, 160, 320, 640, 1024]  # *9.3  *2
        else:
            raise ValueError(
                """groups is not supported for
                       1x1 Grouped Convolutions""")
        # building first layer
        input_channel = self.stage_out_channels[1]
        self.conv1 = frist_conv(1, input_channel)  # 1 dim
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.features = []
        # building Stage2-4
        for idxstage in range(len(self.stage_repeats)):
            numrepeat = self.stage_repeats[idxstage]
            output_channel = self.stage_out_channels[idxstage + 2]
            for i in range(numrepeat):
                if i == 0:
                    # inp, oup, stride, benchmodel):
                    self.features.append(EdgeCRNN_Residual(input_channel, output_channel, 2, 2))
                else:
                    self.features.append(EdgeCRNN_Residual(input_channel, output_channel, 1, 1))
                input_channel = output_channel
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)  # 16层网络
        # building last several layers
        self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])

        self.globalpool = nn.Sequential(nn.AvgPool2d((3, 1), stride=(1, 1)))  # rnn->cnn (3,1)->(3, 7)
        # first-layer(3,1),other(2,1)； cnn first（3,7），other（2,4）

        # add RNN block
        self.hidden_size = 64
        # self.RNN = nn.RNN(self.stage_out_channels[-1], self.hidden_size, num_layers=1, batch_first=True)
        self.RNN = nn.LSTM(self.stage_out_channels[-1], self.hidden_size, num_layers=1, batch_first=True)
        # self.RNN = nn.GRU(self.stage_out_channels[-1], self.hidden_size, num_layers=1, batch_first=True)
        self.classifier = nn.Sequential(nn.Linear(self.hidden_size, n_class))

        # building classifier CNN
        # self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class))

    def forward(self, x):
        # print(x.shape)
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.features(x)
        x = self.conv_last(x)
        # print(x.shape)
        x = self.globalpool(x)  # shape(64,1024,1,4)

        # CNN
        # x = x.squeeze()
        # x = x.view(-1, self.stage_out_channels[-1])

        # add RNN block
        x = x.squeeze(dim=2).permute(0, 2, 1)  # shape(64,1024,1,4)--> shape(b, w, c)  (64, 7, 1024)
        self.RNN.flatten_parameters()
        x, _ = self.RNN(x)  # shape(64, 7, 1024)
        x = x.permute(0, 2, 1).mean(2)  # shape(1, 64,1024)--> (64,1024, 7)

        x = self.classifier(x)
        return x

In [5]:
def set_seed(config):
    seed = int(config["seed"])
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not config["no_cuda"]:
        torch.cuda.manual_seed(seed)
    random.seed(seed)

In [6]:
class VariableMeta(type):
    def __instancecheck__(cls, other):
        return isinstance(other, torch.Tensor)

class Variable(with_metaclass(VariableMeta, torch._C._LegacyVariableBase)):  # type: ignore[misc]
    pass

Variable._execution_engine = ImperativeEngine()

class FocalLoss(nn.Module):
    r"""
        This criterion is a implemenation of Focal Loss, which is proposed in
        Focal Loss for Dense Object Detection.

            Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class])

        The losses are averaged across observations for each minibatch.
        Args:
            alpha(1D Tensor, Variable) : the scalar factor for this criterion
            gamma(float, double) : gamma > 0; reduces the relative loss for well-classiﬁed examples (p > .5),
                                   putting more focus on hard, misclassiﬁed examples
            size_average(bool): size_average(bool): By default, the losses are averaged over observations for each minibatch.
                                However, if the field size_average is set to False, the losses are
                                instead summed for each minibatch.
    """

    def __init__(self, alpha=torch.Tensor([1]), gamma=2, size_average=True):
        super(FocalLoss, self).__init__()

        if isinstance(alpha, Variable):
            self.alpha = alpha
        else:
            self.alpha = Variable(alpha)
        self.gamma = gamma
        self.size_average = size_average

    def forward(self, inputs, targets):
        N = inputs.size(0)
        C = inputs.size(1)
        P = F.softmax(inputs, dim=1)

        class_mask = inputs.data.new(N, C).fill_(0)
        class_mask = Variable(class_mask)
        ids = targets.view(-1, 1)  # shape(N, 1)
        class_mask.scatter_(1, ids.data, 1.)
        # print(class_mask)

        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        # alpha = self.alpha[ids.data.view(-1)]

        probs = (P * class_mask).sum(1).view(-1, 1)

        log_p = probs.log()
        # print('probs size= {}'.format(probs.size()))
        # print(probs)

        batch_loss = -self.alpha * (torch.pow((1 - probs), self.gamma)) * log_p
        # print('-----bacth_loss------')
        # print(batch_loss)

        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss

In [7]:
class DatasetType(Enum):
    TRAIN = 0
    DEV = 1
    TEST = 2

class SpeechDataset(data.Dataset):
    LABEL_SILENCE = "__silence__"  # public static variable
    LABEL_UNKNOWN = "__unknown__"
    def __init__(self, data, set_type, config):
        super().__init__()
        self.config = config
        self.audio_files = list(data.keys())
        self.set_type = set_type
        self.audio_labels = list(data.values())
        config["bg_noise_files"] = list(filter(lambda x: x.endswith("wav"), config.get("bg_noise_files", [])))
        self.bg_noise_audio = [librosa.core.load(file, sr=16000)[0] for file in config["bg_noise_files"]]
        self.unknown_prob = config["unknown_prob"]
        self.silence_prob = config["silence_prob"]
        self.noise_prob = config["noise_prob"]
        self.input_length = config["input_length"]
        self.timeshift_ms = config["timeshift_ms"]
        self._audio_cache = SimpleCache(config["cache_size"])
        self._file_cache = SimpleCache(config["cache_size"])
        n_unk = len(list(filter(lambda x: x == 1, self.audio_labels)))
        self.n_silence = int(self.silence_prob * (len(self.audio_labels) - n_unk))
        self.audio_processor = AudioPreprocessor(n_mels=config["n_mels"], n_dct_filters=config["n_dct_filters"], hop_ms=10, config=config)
        self.audio_preprocess_type = config["audio_preprocess_type"]
        self.n_mels = config["n_mels"]

    @staticmethod
    def default_config():
        config = {}
        config["group_speakers_by_id"] = True
        config["silence_prob"] = 0.1
        config["noise_prob"] = 0.8
        config["n_dct_filters"] = 40
        config["input_length"] = 16000
        config["n_mels"] = 40
        config["timeshift_ms"] = 100
        config["unknown_prob"] = 0.1
        config["train_pct"] = 80
        config["dev_pct"] = 10
        config["test_pct"] = 10
        config["wanted_words"] = ["command", "random"]
        config["data_folder"] = "data/speech_dataset"
        config["audio_preprocess_type"] = "MFCCs"
        return config

    def collate_fn(self, data):
        x = None
        y = []
        mult = 1
        if self.config["feature_type"] == "log_mel":
            mult = 3
        # print("collate star:", time.time())
        for audio_data, label in data:   # data and label
            if self.audio_preprocess_type == "MFCCs":
                audio_tensor = torch.from_numpy(self.audio_processor.compute_mfccs(audio_data).reshape(1, self.config["n_mels"]*mult, 101))  # shape（b, h, w）
                x = audio_tensor if x is None else torch.cat((x, audio_tensor), 0)
            elif self.audio_preprocess_type == "PCEN":
                audio_tensor = torch.from_numpy(np.expand_dims(audio_data, axis=0))
                audio_tensor = self.audio_processor.compute_pcen(audio_tensor)
                x = audio_tensor if x is None else torch.cat((x, audio_tensor), 0)
            y.append(label)
        # print("collate end:", time.time())
        return x, torch.tensor(y)

    def _timeshift_audio(self, data):
        shift = (16000 * self.timeshift_ms) // 1000
        shift = random.randint(-shift, shift)
        a = -min(0, shift)
        b = max(0, shift)
        data = np.pad(data, (a, b), "constant")
        return data[:len(data) - a] if a else data[b:]

    def load_audio(self, example, silence=False):
        if silence:
            example = "__silence__"
        if random.random() < 0.7:
            try:
                return self._audio_cache[example]
            except KeyError:
                pass
        in_len = self.input_length
        if self.bg_noise_audio:
            bg_noise = random.choice(self.bg_noise_audio)
            a = random.randint(0, len(bg_noise) - in_len - 1)
            bg_noise = bg_noise[a:a + in_len]
        else:
            bg_noise = np.zeros(in_len)

        if silence:
            data = np.zeros(in_len)
        else:
            # augmenter = Compose([
            #     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            #     # TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            #     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            #     Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
            # ])
            file_data = self._file_cache.get(example)
            data = librosa.core.load(example, sr=16000)[0] if file_data is None else file_data
            # data = augmenter(samples=data, sample_rate=16000)
            self._file_cache[example] = data
        data = np.pad(data, (0, max(0, in_len - len(data))), "constant")
        if self.set_type == DatasetType.TRAIN:
            data = self._timeshift_audio(data)
        if self.config["add_noise"]:
            if random.random() < self.noise_prob or silence:
                a = random.random() * 0.1
                data = np.clip(a * bg_noise + data, -1, 1)
        # data = np.clip(data, -1, 1)

        self._audio_cache[example] = data
        return data

    @classmethod
    def splits(cls, config):
        folder = config["data_folder"]  # data/speech_dataset
        wanted_words = config["wanted_words"] # ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
        unknown_prob = config["unknown_prob"] # 0.1
        train_pct = config["train_pct"]  # 80
        dev_pct = config["dev_pct"]  # 10
        test_pct = config["test_pct"]  # 10

        words = {word: i + 2 for i, word in enumerate(wanted_words)}
        # {'yes': 2, 'no': 3, 'up': 4, 'down': 5, 'left': 6, 'right': 7, 'on': 8, 'off': 9, 'stop': 10, 'go': 11}
        words.update({cls.LABEL_SILENCE:0, cls.LABEL_UNKNOWN:1})
        sets = [{}, {}, {}]
        unknowns = [0] * 3
        bg_noise_files = []
        unknown_files = []

        for folder_name in os.listdir(folder):
            path_name = os.path.join(folder, folder_name)   # data/speech_dataset/yes
            is_bg_noise = False
            if os.path.isfile(path_name):
                continue
            if folder_name in words:
                label = words[folder_name]
            elif folder_name == "_background_noise_":
                is_bg_noise = True
            else:
                label = words[cls.LABEL_UNKNOWN]

            for filename in os.listdir(path_name):
                wav_name = os.path.join(path_name, filename)  # data/speech_dataset/down/00b01445_nohash_1.wav
                if is_bg_noise and os.path.isfile(wav_name):
                    bg_noise_files.append(wav_name)
                    continue
                elif label == words[cls.LABEL_UNKNOWN]:  # here the one\four folder is the UNKNOWN
                    unknown_files.append(wav_name)
                    continue
                if config["group_speakers_by_id"]:
                    hashname = re.sub(r"_nohash_.*$", "", filename)
                max_no_wavs = 2**27 - 1
                bucket = int(hashlib.sha1(hashname.encode()).hexdigest(), 16)
                # hash values  hexdigest() return 16 jinzhi
                bucket = (bucket % (max_no_wavs + 1)) * (100. / max_no_wavs)
                if bucket < dev_pct:
                    tag = DatasetType.DEV   # TRAIN = 0, DEV = 1, TEST = 2
                elif bucket < test_pct + dev_pct:  # dev_pct = 10, test_pct = 10, train_pct = 80
                    tag = DatasetType.TEST
                else:
                    tag = DatasetType.TRAIN
                if config["type"] == "eval":
                    sets[2][wav_name] = label
                elif config["type"] == "train":
                    sets[tag.value][wav_name] = label
                #  sets = [
                # train  {'00b01445_nohash_1': 1, },  length = 16696
                # dev    {'00b01443_nohash_1': 2, },  length = 2316
                # test   {'00b01441_nohash_1': 3, }   length = 2311
                #  ]

        for tag in range(len(sets)):
            unknowns[tag] = int(unknown_prob * len(sets[tag]))  # train length, validation, test
        random.shuffle(unknown_files)
        a = 0
        for i, dataset in enumerate(sets):
            b = a + unknowns[i]
            unk_dict = {u: words[cls.LABEL_UNKNOWN] for u in unknown_files[a:b]}
            dataset.update(unk_dict)
            a = b
            # unk_dict = {
            #   0:len(train_dataset)-1,
            #   len(train_dataset): len(train+dev_dataset)-1
            #   len(train+dev):len(train+dev+test)-1
            # }
        train_cfg = ChainMap(dict(bg_noise_files=bg_noise_files), config)
        test_cfg = ChainMap(dict(bg_noise_files=bg_noise_files, noise_prob=0), config)
        # print(test_cfg)
        datasets = (cls(sets[0], DatasetType.TRAIN, train_cfg), cls(sets[1], DatasetType.DEV, test_cfg),
                    cls(sets[2], DatasetType.TEST, config))
        return datasets

    def __getitem__(self, index):
        if index >= len(self.audio_labels):
            return self.load_audio(None, silence=True), 0
        return self.load_audio(self.audio_files[index]), self.audio_labels[index]

    def __len__(self):
        # return len(self.audio_labels) + self.n_silence
        return len(self.audio_labels)

In [8]:
class AudioPreprocessor(object):
    def __init__(self, sr=16000, n_dct_filters=40,  n_mels=40, f_max=4000, f_min=20, n_fft=480, hop_ms=10, config=None):
        super().__init__()
        self.config = config
        self.n_mels = n_mels  # 40
        self.sr = sr
        self.f_max = f_max if f_max is not None else sr // 2  # 4000
        self.f_min = f_min  # 20
        self.n_fft = n_fft  # duan shi fu li ye 480
        self.hop_length = sr // 1000 * hop_ms
        self.pcen_transform = pcen.StreamingPCENTransform(n_mels=n_mels, n_fft=n_fft, hop_length=self.hop_length, trainable=True)

    def compute_mfccs(self, data):
        mfcc = librosa.feature.mfcc(
            data,
            sr=self.sr,
            n_mfcc=self.n_mels,
            hop_length=self.hop_length
        )
        mfcc = np.array(mfcc, order="F").astype(np.float32)

        if self.config["feature_type"] == "log_mel":
            mel_spec = librosa.feature.melspectrogram(
                data,
                sr=self.sr,  # 16000
                n_mels=self.n_mels,  # 40
                hop_length=self.hop_length,  # 160
                n_fft=self.n_fft,  # 480
                fmin=self.f_min,  # 20
                fmax=self.f_max  # 4000
            )
            # data[data > 0] = np.log(data[data > 0])
            # data = [np.matmul(self.dct_filters, x) for x in np.split(data, data.shape[1], axis=1)]
            mel_spec = np.array(mel_spec, order="F").astype(np.float32)

            log_mel = librosa.power_to_db(mel_spec)
            delta = librosa.feature.delta(mfcc)
            delta_delta = librosa.feature.delta(delta)
            data = np.vstack([log_mel, delta, delta_delta])  # (120, 101)
            return data  # shape(120, 101, 1)
        elif self.config["feature_type"] == "MFCC":
            # print(mfcc.shape)
            return mfcc  # data shape(40，101)
        elif self.config["feature_type"] == "PCEN":
            spec = librosa.feature.melspectrogram(data, self.sr, power=1, n_mels=self.n_mels, hop_length=self.hop_length, n_fft=self.n_fft)
            pcen = librosa.pcen(spec, self.sr)  # (40,101)
            pcen = np.array(pcen, order="F").astype(np.float32)
            return pcen

    def compute_pcen(self, data):
        data = self.pcen_transform(data)
        self.pcen_transform.reset()
        return data

In [9]:
class SimpleCache(dict):
    def __init__(self, limit):
        super().__init__()
        self.limit = limit
        self.n_keys = 0

    def __setitem__(self, key, value):
        if key in self.keys():
            super().__setitem__(key, value)
        elif self.n_keys < self.limit:
            self.n_keys += 1
            super().__setitem__(key, value)
        return value
    
def print_eval(name, scores, labels, loss, step=0, interval=50, file=None, model_type=None, end="\n"):
    batch_size = labels.size(0)
    # print(batch_size, scores.shape)
    accuracy = (torch.max(scores, 1)[1].view(batch_size).data == labels.data).float().sum() / batch_size
    if model_type == "eval":
        print("the predicted value:", torch.max(scores, 1)[1].numpy() - 2 )
        print("the  labels   value:", labels.numpy() - 2)
    if step % interval == 0:
        print_result = "{} accuracy: {:>5}, loss: {:<25}".format(name, accuracy, loss)
        if file:
            file.write(print_result+end)
        print(print_result)
    return accuracy.item()

def train(config):
    output_dir = os.path.dirname(os.path.abspath(config["output_file"]))
    train_path = "{}-{}-train.txt".format(config["output_file"], datetime.datetime.now().strftime("%m-%d %H.%M.%S"))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model = config["model"]
    if config["input_file"]:
        if not config["no_cuda"]:
            parameters = torch.load(config["input_file"])
        else:
            parameters = torch.load(config["input_file"], map_location='cpu')
        model.load_state_dict(parameters)
    if not config["no_cuda"]:
        print(config["gpu_no"])
        # torch.cuda.set_device(config["gpu_no"])
        model.cuda()
    if config["optimizer"] == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"][0], nesterov=config["use_nesterov"], weight_decay=config["weight_decay"], momentum=config["momentum"])
    if config["optimizer"] == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"][0])
    if config["loss"] == "CE":
        criterion = nn.CrossEntropyLoss()
    if config["loss"] == "focal":
        criterion = FocalLoss()
    schedule_steps = config["schedule"]
    schedule_steps.append(np.inf)
    sched_idx = 0
    max_acc = 0
    # model = model.float()

    train_set, dev_set, test_set = SpeechDataset.splits(config)
    train_loader = data.DataLoader(
        train_set,
        batch_size=config["batch_size"],  # 64
        shuffle=True, drop_last=True,
        collate_fn=train_set.collate_fn,
        num_workers=4
    )
    dev_loader = data.DataLoader(
        dev_set,
        batch_size=min(len(dev_set), 16),
        shuffle=True,
        collate_fn=dev_set.collate_fn,
        num_workers=4
    )
    test_loader = data.DataLoader(
        test_set,
        batch_size=min(len(test_set), 16),
        shuffle=True,
        collate_fn=test_set.collate_fn,
        num_workers=4
    )
    step_no = 0

    train_file = open(train_path, "a")
    train_file.write(config["output_file"])
    for epoch_idx in range(config["n_epochs"]):
        train_accs = []
        print("epoch {} start time：{}".format(epoch_idx, datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")))
        for batch_idx, (model_in, labels) in enumerate(train_loader):  # 花费5秒-64, 7秒-128
            # print("for star:", time.time())
            model.train()  # switch model to train model
            optimizer.zero_grad()
            if not config["no_cuda"]:
                model = model.cuda()
                model_in = model_in.cuda()
                labels = labels.cuda()
            model_in = Variable(model_in, requires_grad=True)
            model_in = torch.unsqueeze(model_in, 1)
            scores = model(model_in)
            labels = Variable(labels, requires_grad=False).long()
            loss = criterion(scores, labels)
            loss.backward()
            optimizer.step()
            step_no += 1
            # print("for end:", time.time())
            train_accs.append(print_eval("[{}] train Epoch:{} step #{}".format(datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S"), epoch_idx, step_no),
                                         scores, labels, loss, step_no, file=train_file))

        # LR setting, 50 epoch
        if (epoch_idx + 1) % 50 == 0:
            lr_intel = config["lr"][0] - (epoch_idx + 1) / config["n_epochs"] * (config["lr"][0] - config["lr"][1])
            print("changing learning rate to {}".format(lr_intel))
            train_file.write("changing learning rate to {}".format(lr_intel))
            optimizer = torch.optim.Adam(model.parameters(), lr=lr_intel)

        print_log = "[{}] train Epoch:{} Accuracy：{}".format(datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S"), epoch_idx, np.mean(train_accs))
        print(print_log)
        train_file.write(print_log)
        print("epoch {} end  time：{}".format(epoch_idx, datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")))
        # 测试阶段
        with torch.no_grad():
            model.eval()
            accs = []
            index = 0
            total_score = torch.Tensor()
            total_label = torch.Tensor()
            for model_in, labels in dev_loader:
                model_in = Variable(model_in, requires_grad=False)
                model_in = torch.unsqueeze(model_in, 1)
                if not config["no_cuda"]:
                    model_in = model_in.cuda()
                    labels = labels.cuda()
                scores = model(model_in)
                labels = Variable(labels, requires_grad=False).long()
                if len(total_label):
                    total_label = torch.cat((total_label, labels))
                    total_score = torch.cat((total_score, scores))
                else:
                    total_label = labels
                    total_score = scores

                loss = criterion(scores, labels)
                index = index + 1
                accs.append(print_eval("[{}] dev Epoch:{} ".format(datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S"), epoch_idx),
                                       scores, labels, loss, index, file=train_file, interval=30))

            if not config["no_cuda"]:
                total_score = total_score.cpu()
                total_label = total_label.cpu()
            avg_acc = np.mean(accs)
            print("final dev accuracy: {}".format(avg_acc))
            train_file.write("[{}] Epochs {} final dev accuracy: {}\n".format(datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S"), epoch_idx, avg_acc))
            if avg_acc > max_acc:
                print("saving best model...")
                train_file.write("the best accuracy:{}, saving best model...\n".format(avg_acc))
                max_acc = avg_acc
                if max_acc > 0.90:
                    torch.save(model.state_dict(), config["output_file"]+"-{:.5f}.pt".format(avg_acc))
                    # 计算ROC曲线
                    y_one_hot = label_binarize(total_label, np.arange(config["n_labels"]))
                    fpr, tpr, thresholds = metrics.roc_curve(y_one_hot.ravel(), total_score.detach().numpy().ravel())
                    np.savetxt(config["output_file"] + "-{:.4f}-{}.csv".format(avg_acc, epoch_idx), [fpr, tpr],
                               delimiter=',', header="FPR,TPR")
                    config["input_file"] = config["output_file"]+"-{:.5f}.pt".format(avg_acc)  # input model

        evaluate(config, model, test_loader)

    train_file.close()

    # evaluate(config, model, test_loader)

In [10]:
def evaluate(config, model=None, test_loader=None):
    # config["feature_type"] = "log_mel"
    if not test_loader:
        _, _, test_set = dataset.SpeechDataset.splits(config)
        test_loader = data.DataLoader(
            test_set,
            batch_size=len(test_set),
            collate_fn=test_set.collate_fn)

    if model == None:
        model = config["model"]
        if config["input_file"]:
            if not config["no_cuda"]:
                parameters = torch.load(config["input_file"])
            else:
                parameters = torch.load(config["input_file"], map_location='cpu')
            model.load_state_dict(parameters)
        if not config["no_cuda"]:
            torch.cuda.set_device(config["gpu_no"])
            model.cuda()
    model.eval()
    results = []
    total = 0
    for model_in, labels in test_loader:
        model_in = Variable(model_in, requires_grad=False)
        model_in = torch.unsqueeze(model_in, 1)
        if not config["no_cuda"]:
            model_in = model_in.cuda()
            labels = labels.cuda()
        scores = model(model_in)
        labels = Variable(labels, requires_grad=False)
        # loss = criterion(scores, labels)
        total += model_in.size(0)
        results.append(print_eval("test", scores, labels, 0, total, model_type=config["type"]) * model_in.size(0))
    print("final test accuracy: {}".format(sum(results) / total))
    return sum(results) / total

In [11]:
def main():
    global_config = dict(lr=[0.001, 0.0001], schedule=[np.inf], batch_size=64, dev_every=1, seed=0,
                         model=None, use_nesterov=False, gpu_no=0, cache_size=32768, momentum=0.9, weight_decay=0.00001)
    builder = ConfigBuilder(default_config(), global_config)
    parser = builder.build_argparse()
    # parser.add_argument("--no_cuda", type=str2bool, nargs='?', const=True)

    config = builder.config_from_argparse(parser)

    model = EdgeCRNN(width_mult=config["width_mult"])
    model = torch.nn.DataParallel(model)
    
    config["model"] = model
    set_seed(config)
    if config["type"] == "train":
        train(config)
    elif config["type"] == "eval":
        evaluate(config)

In [12]:
main()

0
epoch 0 start time：2021-10-31 13.47.34
[2021-10-31 13.48.24] train Epoch:0 step #50 accuracy: 0.234375, loss: 1.5652639865875244       
[2021-10-31 13.49.09] train Epoch:0 step #100 accuracy: 0.671875, loss: 0.6917768716812134       
[2021-10-31 13.50.00] train Epoch:0 step #150 accuracy: 0.78125, loss: 0.41581717133522034      
[2021-10-31 13.50.46] train Epoch:0 step #200 accuracy: 0.8125, loss: 0.3848932981491089       
[2021-10-31 13.51.37] train Epoch:0 step #250 accuracy:  0.75, loss: 0.41717398166656494      
[2021-10-31 13.52.23] train Epoch:0 step #300 accuracy: 0.765625, loss: 0.3950932025909424       
[2021-10-31 13.53.13] train Epoch:0 step #350 accuracy: 0.921875, loss: 0.21312862634658813      
[2021-10-31 13.53.59] train Epoch:0 step #400 accuracy: 0.828125, loss: 0.2480033040046692       
[2021-10-31 13.54.49] train Epoch:0 step #450 accuracy: 0.859375, loss: 0.20732825994491577      
[2021-10-31 13.55.16] train Epoch:0 Accuracy：0.7328157484407485
epoch 0 end  time：20