In [1]:
import os
import argparse
import numpy as np
import tensorflow as tf

from tensorflow.python.ops import gen_audio_ops as audio_ops

In [2]:
# Human-readable class names. show_result() indexes this list with a predicted
# class index, so position i names model output i.
# NOTE(review): the order presumably has to match the label order the .tflite
# model was trained with -- confirm against the training script.
Label_classes = [
    "_silence_",
    "_unknown_",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "zero",
]

def load_wav_file(wav_filename, desired_samples=16000, clip_len_s=1):
    """Decode a WAV file and cut it into consecutive, equally sized clips.

    Args:
        wav_filename: Path of the WAV file to read.
        desired_samples: Number of samples per clip-second.
        clip_len_s: Multiplier applied to ``desired_samples``; each clip holds
            ``desired_samples * clip_len_s`` samples.

    Returns:
        A ``(clips, sample_rate)`` pair. ``clips`` is a list of slices of the
        decoded audio (decoded with ``desired_channels=1``); ``sample_rate`` is
        the sample rate reported by the decoder. Samples left over after the
        last full clip are dropped, so a file shorter than one clip yields an
        empty list.
    """
    raw_bytes = tf.io.read_file(wav_filename)
    decoded = audio_ops.decode_wav(raw_bytes, desired_channels=1)

    audio = decoded[0]
    samples_per_clip = desired_samples * clip_len_s
    # Floor division: only whole clips are produced.
    clip_count = len(audio) // samples_per_clip

    clips = [
        audio[index * samples_per_clip:(index + 1) * samples_per_clip]
        for index in range(clip_count)
    ]
    return clips, decoded.sample_rate

def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride, num_mfcc):
    """Compute MFCC features for one audio clip.

    Args:
        audio_signal: Decoded audio passed to ``audio_ops.audio_spectrogram``.
        audio_sample_rate: Sample rate forwarded to ``audio_ops.mfcc``.
        window_size: Spectrogram window length, in samples.
        window_stride: Hop between successive windows, in samples.
        num_mfcc: Number of DCT coefficients to keep per frame.

    Returns:
        The output of ``audio_ops.mfcc`` for the squared-magnitude spectrogram.
    """
    spectrogram_options = {
        "window_size": window_size,
        "stride": window_stride,
        "magnitude_squared": True,  # power spectrogram, not plain magnitude
    }
    power_spectrogram = audio_ops.audio_spectrogram(input=audio_signal, **spectrogram_options)
    return audio_ops.mfcc(power_spectrogram, audio_sample_rate, dct_coefficient_count=num_mfcc)
    
def calculate_accuracy(predicted_indices, expected_indices):
    """Return the fraction of positions where prediction and expectation agree.

    Both arguments are compared element-wise with ``tf.equal``; the matches are
    cast to float32 and averaged with ``tf.reduce_mean``.
    """
    matches = tf.cast(tf.equal(predicted_indices, expected_indices), tf.float32)
    return tf.reduce_mean(matches)

In [3]:
def _process_path(wav_filename, clip_duration_ms=1000):
    """Turn a WAV file into one flattened MFCC vector per clip.

    Args:
        wav_filename: Path of the WAV file to process.
        clip_duration_ms: Clip length in milliseconds. The per-clip sample
            count is derived from a fixed 16000 samples per second, not from
            the file's own sample rate.

    Returns:
        A list with one 1-D tensor per clip returned by ``load_wav_file``.
    """
    samples_per_clip = int(16000 * clip_duration_ms / 1000)
    clips, sample_rate = load_wav_file(wav_filename, samples_per_clip)

    # 40 ms analysis window with a 20 ms hop, expressed in samples of the
    # sample rate reported by the decoder.
    frame_length = int(sample_rate * 40 / 1000)
    frame_step = int(sample_rate * 20 / 1000)
    coefficient_count = 10

    return [
        tf.reshape(
            calculate_mfcc(clip, sample_rate, frame_length, frame_step, coefficient_count),
            [-1],
        )
        for clip in clips
    ]

In [4]:
class tflite_inference_class:
    """Wrapper around a ``tf.lite.Interpreter`` that handles I/O (de)quantization.

    The model is loaded once at construction. ``tflite_inference`` then accepts
    real-valued input, quantizes it when the model's first input tensor is an
    integer type with quantization parameters, runs the model and returns the
    first output tensor converted back to float32.

    Attributes:
        tflite_path: Path the model was loaded from.
        interpreter: The allocated interpreter instance.
        input_details / output_details: Tensor metadata from the interpreter.
        input_dtype / output_dtype: NumPy dtype of the first input / output.
        input_scale, input_zero_point: Input quantization, ``(1, 0)`` if none.
        output_scale, output_zero_point: Output quantization, ``(1, 0)`` if none.
    """

    def __init__(self, tflite_path):
        """Load the model at ``tflite_path`` and cache its I/O metadata."""
        self.tflite_path = tflite_path
        self.interpreter = self.init_tflite_model()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()
        self.input_dtype = self.input_details[0]["dtype"]
        self.output_dtype = self.output_details[0]["dtype"]
        self.input_scale, self.input_zero_point = self._quantization_params(self.input_details[0])
        self.output_scale, self.output_zero_point = self._quantization_params(self.output_details[0])

    @staticmethod
    def _quantization_params(tensor_details):
        """Return ``(scale, zero_point)`` for a tensor, or ``(1, 0)`` if unquantized.

        A tensor counts as quantized when its dtype is an integer type and its
        reported scale is non-zero. This covers int8 as before and also other
        integer-quantized models (e.g. uint8), while avoiding a division by
        zero for tensors that report a zero scale.
        """
        scale, zero_point = tensor_details.get("quantization", (0.0, 0))
        if np.issubdtype(tensor_details["dtype"], np.integer) and scale != 0:
            return scale, zero_point
        return 1, 0

    def init_tflite_model(self):
        """Create the interpreter for ``self.tflite_path`` and allocate its tensors."""
        print("init_tflite_model")
        interpreter = tf.lite.Interpreter(model_path=self.tflite_path)
        interpreter.allocate_tensors()
        return interpreter

    def tflite_inference(self, input_data):
        """Run one inference and return the dequantized first output tensor.

        Args:
            input_data: Real-valued input (array-like or tensor) shaped as the
                model's first input expects.

        Returns:
            A float32 NumPy array: ``output_scale * (raw_output - output_zero_point)``.
        """
        values = np.asarray(input_data, dtype=np.float32)
        values = values / self.input_scale + self.input_zero_point

        if np.issubdtype(self.input_dtype, np.integer):
            # Round, then saturate to the representable range so that
            # out-of-range values do not overflow in the integer cast.
            limits = np.iinfo(self.input_dtype)
            values = np.clip(np.round(values), limits.min, limits.max)

        self.interpreter.set_tensor(self.input_details[0]['index'], values.astype(self.input_dtype))
        self.interpreter.invoke()

        output_data = self.interpreter.get_tensor(self.output_details[0]['index'])
        return self.output_scale * (output_data.astype(np.float32) - self.output_zero_point)
        

In [7]:
def inference_wav(wav_filename, tflite_path, Label_classes):
    """Classify every 1-second clip of a WAV file with a TFLite model.

    Args:
        wav_filename: Path of the WAV file to evaluate.
        tflite_path: Path of the ``.tflite`` model to load.
        Label_classes: Accepted for call compatibility; not used in the body.

    Returns:
        A list with one predicted class index per clip, in clip order. The
        list is empty when the file holds no complete clip; in that case the
        model is not loaded.
    """
    mfcc_list = _process_path(wav_filename, clip_duration_ms=1000)
    print("Total clips number: {}, and every clip is 1 (s)".format(len(mfcc_list)))
    if not mfcc_list:
        # Recording shorter than one clip: there is no mfcc_list[0] to
        # describe and nothing to classify.
        return []
    print("Input dims: {}".format(mfcc_list[0].shape.as_list()))

    tflite_task = tflite_inference_class(tflite_path)
    predicted_indices = []
    for mfcc in mfcc_list:
        # The model is fed one clip at a time, so add a leading batch axis.
        batch = tf.expand_dims(mfcc, axis=0)
        prediction = tflite_task.tflite_inference(batch)
        predicted_indices.append(np.squeeze(tf.argmax(prediction, axis=1)))
    return predicted_indices

def show_result(predicted_indices, Label_classes):
    """Print a majority-label summary of per-clip predictions.

    No ground truth is available, so the most frequent predicted index is used
    as a stand-in ("fake") expected label for every clip. The printed
    "accuracy" is therefore the share of clips that agree with the majority
    prediction. On ties the smallest class index wins, because ``np.unique``
    returns sorted values and ``np.argmax`` picks the first maximum. To score
    against a real label, replace ``fake_expected_indices`` with its index.

    Args:
        predicted_indices: Sequence of predicted class indices, one per clip.
        Label_classes: Class names indexed by class index; its length sets the
            size of the confusion matrix.

    Raises:
        ValueError: If ``predicted_indices`` is empty.
    """
    if len(predicted_indices) == 0:
        raise ValueError("predicted_indices is empty; there is nothing to summarise")

    unique_elements, counts = np.unique(np.array(predicted_indices), return_counts=True)
    fake_expected_indices = unique_elements[np.argmax(counts)]
    print(f'Fake Label is {Label_classes[fake_expected_indices]}')
    expected_indices = np.full(len(predicted_indices), fake_expected_indices)

    test_accuracy = calculate_accuracy(predicted_indices, expected_indices)
    # Size the matrix from the label list instead of a hard-coded class count.
    confusion_matrix = tf.math.confusion_matrix(
        expected_indices, predicted_indices, num_classes=len(Label_classes)
    )

    print(f'Test accuracy = {test_accuracy * 100:.2f}%'f'(N={len(predicted_indices)})')
    print(confusion_matrix.numpy())

## Clean wav to kws

In [25]:
# Clean-speech recording. The path is built with os.path.join so the cell also
# runs on non-Windows hosts (on Windows the resulting string is unchanged).
wav_filename = os.path.join("MS-SNSD", "kws_noisy_v1", "CleanSpeech_training_kws", "clnsp3.wav")
tflite_path = r"ds_cnn_int8quant.tflite"

# Classify each 1-second clip, then print the majority-label summary.
show_result(inference_wav(wav_filename, tflite_path, Label_classes), Label_classes)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 93.33%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  3 56  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Noisy_db_0 + clean wav to kws

In [27]:
# Noisy recording at 0 dB SNR. The path is built with os.path.join so the cell
# also runs on non-Windows hosts (on Windows the resulting string is unchanged).
wav_filename = os.path.join("MS-SNSD", "kws_noisy_v1", "NoisySpeech_training_kws", "noisy3_SNRdb_0.0_clnsp3.wav")
tflite_path = r"ds_cnn_int8quant.tflite"

# Classify each 1-second clip, then print the majority-label summary.
show_result(inference_wav(wav_filename, tflite_path, Label_classes), Label_classes)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 65.00%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  8 39  2  1  1  0  1  2  5  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Noisy_db_10 + clean wav to kws

In [28]:
# Noisy recording at 10 dB SNR. The path is built with os.path.join so the cell
# also runs on non-Windows hosts (on Windows the resulting string is unchanged).
wav_filename = os.path.join("MS-SNSD", "kws_noisy_v1", "NoisySpeech_training_kws", "noisy3_SNRdb_10.0_clnsp3.wav")
tflite_path = r"ds_cnn_int8quant.tflite"

# Classify each 1-second clip, then print the majority-label summary.
show_result(inference_wav(wav_filename, tflite_path, Label_classes), Label_classes)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 73.33%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  8 44  1  0  1  0  2  1  3  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Noisy_db_20 + clean wav to kws

In [29]:
# Noisy recording at 20 dB SNR. The path is built with os.path.join so the cell
# also runs on non-Windows hosts (on Windows the resulting string is unchanged).
wav_filename = os.path.join("MS-SNSD", "kws_noisy_v1", "NoisySpeech_training_kws", "noisy3_SNRdb_20.0_clnsp3.wav")
tflite_path = r"ds_cnn_int8quant.tflite"

# Classify each 1-second clip, then print the majority-label summary.
show_result(inference_wav(wav_filename, tflite_path, Label_classes), Label_classes)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 76.67%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  7 46  2  0  2  0  0  1  1  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Noisy_db_0 + clean wav to kws (filtered by rnn-denoise)

In [137]:
# Filtered version of a 0 dB SNR recording. The path is built with os.path.join
# so the cell also runs on non-Windows hosts (on Windows the string is unchanged).
wav_filename = os.path.join("MS-SNSD", "kws_noisy_v1", "noisy1_SNRdb_0.0_clnsp1_filtered.wav")
tflite_path = r"ds_cnn_int8quant.tflite"

# Classify each 1-second clip, then print the majority-label summary.
show_result(inference_wav(wav_filename, tflite_path, Label_classes), Label_classes)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is _silence_
Test accuracy = 45.00%(N=60)
[[27  1  1  1  1  1 19  0  0  1  8  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Original noisy sample

In [40]:
# Model and recording for this run (both resolved relative to the working directory).
tflite_path = r"ds_cnn_int8quant.tflite"
wav_filename = r"_noisy_sample_kws.wav"

# Classify each 1-second clip, then print the majority-label summary.
show_result(
    inference_wav(wav_filename, tflite_path, Label_classes),
    Label_classes,
)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 83.33%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  1 50  1  0  4  2  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


## Filtered sample

In [41]:
# Model and recording for this run (both resolved relative to the working directory).
tflite_path = r"ds_cnn_int8quant.tflite"
wav_filename = r"_filtered_sample_kws.wav"

# Classify each 1-second clip, then print the majority-label summary.
show_result(
    inference_wav(wav_filename, tflite_path, Label_classes),
    Label_classes,
)

Total clips number: 60, and every clip is 1 (s)
Input dims: [490]
init_tflite_model
Fake Label is one
Test accuracy = 76.67%(N=60)
[[ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  4 46  2  0  6  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0]]


In [37]:
import shutil

def move_files_with_pattern(source_dir, destination_dir, pattern="_0.0_"):
    """Copy files whose names contain ``pattern`` into ``destination_dir``.

    Despite its name, this function copies (``shutil.copy2``) and leaves the
    source files in place. Only regular files directly inside ``source_dir``
    are considered; matching sub-directories are skipped.

    Args:
        source_dir: Directory to scan (not recursive).
        destination_dir: Directory to copy into; created if it does not exist.
        pattern: Substring a file name must contain to be copied.

    Returns:
        The names of the copied files, in sorted order.
    """
    # Ensure the destination directory exists, create it if not
    os.makedirs(destination_dir, exist_ok=True)

    copied = []
    for file_name in sorted(os.listdir(source_dir)):
        if pattern not in file_name:
            continue

        source_file = os.path.join(source_dir, file_name)
        if not os.path.isfile(source_file):
            # copy2 cannot copy a directory; skip it instead of crashing.
            continue

        shutil.copy2(source_file, os.path.join(destination_dir, file_name))
        # The log says "Copied" because the source file is left in place.
        print(f"Copied: {file_name}")
        copied.append(file_name)

    return copied

# Example usage. Both directories are machine-specific absolute Windows paths;
# adjust them before running on another machine.
source_directory = r'C:\CYCHEN38\MICRO_ML\nnom\examples\rnn-denoise\MS-SNSD\kws_noisy_v1\NoisySpeech_training_kws'
destination_directory = r'C:\CYCHEN38\MICRO_ML\NSNet\dataset\kws\training\noisy'

# Select the files whose names contain "_0.0_". The helper copies them with
# shutil.copy2, so the source directory keeps its files.
move_files_with_pattern(source_directory, destination_directory, pattern="_0.0_")

Moved: noisy100_SNRdb_0.0_clnsp100.wav
Moved: noisy10_SNRdb_0.0_clnsp10.wav
Moved: noisy11_SNRdb_0.0_clnsp11.wav
Moved: noisy12_SNRdb_0.0_clnsp12.wav
Moved: noisy13_SNRdb_0.0_clnsp13.wav
Moved: noisy14_SNRdb_0.0_clnsp14.wav
Moved: noisy15_SNRdb_0.0_clnsp15.wav
Moved: noisy16_SNRdb_0.0_clnsp16.wav
Moved: noisy17_SNRdb_0.0_clnsp17.wav
Moved: noisy18_SNRdb_0.0_clnsp18.wav
Moved: noisy19_SNRdb_0.0_clnsp19.wav
Moved: noisy1_SNRdb_0.0_clnsp1.wav
Moved: noisy20_SNRdb_0.0_clnsp20.wav
Moved: noisy21_SNRdb_0.0_clnsp21.wav
Moved: noisy22_SNRdb_0.0_clnsp22.wav
Moved: noisy23_SNRdb_0.0_clnsp23.wav
Moved: noisy24_SNRdb_0.0_clnsp24.wav
Moved: noisy25_SNRdb_0.0_clnsp25.wav
Moved: noisy26_SNRdb_0.0_clnsp26.wav
Moved: noisy27_SNRdb_0.0_clnsp27.wav
Moved: noisy28_SNRdb_0.0_clnsp28.wav
Moved: noisy29_SNRdb_0.0_clnsp29.wav
Moved: noisy2_SNRdb_0.0_clnsp2.wav
Moved: noisy30_SNRdb_0.0_clnsp30.wav
Moved: noisy31_SNRdb_0.0_clnsp31.wav
Moved: noisy32_SNRdb_0.0_clnsp32.wav
Moved: noisy33_SNRdb_0.0_clnsp33.wav
Mov