# microIA project : bird recognition

## Imports

In [3]:

import os
from pathlib import Path
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Conv1D, AvgPool1D, MaxPool1D, ZeroPadding1D, BatchNormalization, Flatten, Dense, Activation
from keras.utils.data_utils import get_file
from keras.utils.np_utils import to_categorical
import wave 
import xenocanto
import random



In [8]:
# Species handled by this notebook, given by scientific name (used as xeno-canto query terms below)
birds = [
    'Emberiza citrinella',
    'Cuculus canorus',
    'Emberiza cirlus',
    'Muscicapa striata',
]
# Root folder of the local dataset (audio files and split lists live underneath it)
dataset_dir = Path('dataset')


## Download, cache and extract bird songs from Xeno-canto

In [59]:

if not (dataset_dir/'testing_list.txt').exists(): # Assume dataset already downloaded/extracted if testing list is present
    for bird in birds : 
        xenocanto.metadata([bird,"type:song","q:A"])
        xenocanto.metadata([bird,"type:song","q:B"])
        await xenocanto.download([bird,"type:song","q:A"],2)
        await xenocanto.download([bird,"type:song","q:B"],2)
        if bird == 'Muscicapa striata' :
            xenocanto.metadata([bird,"type:song","q:C"])
            await xenocanto.download([bird,"type:song","q:C"],2)


In [6]:
# Split every recording into 10-second segments with ffmpeg (Windows / PowerShell only):
#  - Get-ChildItem walks the current directory recursively for *.mp3, skipping names matching *_*.mp3
#  - each <name>.mp3 is segmented into <name>_NNN.mp3 (-f segment -segment_time 10, %03d numbering)
#  - the source file is then removed; rm runs whether or not ffmpeg succeeded
# NOTE(review): this starts from "." rather than dataset_dir and is not guarded by the
# testing_list.txt check used by the other dataset-preparation cells.
!powershell -Command "Get-ChildItem  . -Recurse -Filter *.mp3 -Exclude *_*.mp3 | ForEach-Object {  $input = $_.Fullname ; $newname = $_.Fullname.replace(\".mp3\",\"\")+\"_%03d.mp3\"; ffmpeg -i $input -f segment -segment_time 10 $newname; rm $input}"

ffmpeg version 6.0-essentials_build-www.gyan.dev Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12.2.0 (Rev10, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-libaom --enable-libopenjpeg --enable-libvpx --enable-libass --enable-libfreetype --enable-libfribidi --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-d3d11va --enable-dxva2 --enable-libmfx --enable-libgme --enable-libopenmpt --enable-libopencore-amrwb --enable-libmp3lame --enable-libtheora --enable-libvo-amrwbenc --enable-libgsm --enable-libopencore-amrnb --enable-

In [10]:
# Convert every *.mp3 found recursively under the current directory to a .wav file with ffmpeg
# (-ac 2: two audio channels, -ar 48000: 48 kHz sample rate), then remove the mp3.
# The removal runs whether or not ffmpeg succeeded.
# NOTE(review): like the segmentation cell, this starts from "." and is not guarded by testing_list.txt.
!powershell -Command "Get-ChildItem  . -Recurse -Filter *.mp3 | ForEach-Object {  $input = $_.Fullname;$newname = $_.Fullname.replace(\".mp3\",\".wav\"); ffmpeg -i $input -ac 2 -ar 48000 $newname; rm $input}"

ffmpeg version 6.0-essentials_build-www.gyan.dev Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12.2.0 (Rev10, Built by MSYS2 project)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-w32threads --disable-autodetect --enable-fontconfig --enable-iconv --enable-gnutls --enable-libxml2 --enable-gmp --enable-lzma --enable-zlib --enable-libsrt --enable-libssh --enable-libzmq --enable-avisynth --enable-sdl2 --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-libaom --enable-libopenjpeg --enable-libvpx --enable-libass --enable-libfreetype --enable-libfribidi --enable-libvidstab --enable-libvmaf --enable-libzimg --enable-amf --enable-cuda-llvm --enable-cuvid --enable-ffnvcodec --enable-nvdec --enable-nvenc --enable-d3d11va --enable-dxva2 --enable-libmfx --enable-libgme --enable-libopenmpt --enable-libopencore-amrwb --enable-libmp3lame --enable-libtheora --enable-libvo-amrwbenc --enable-libgsm --enable-libopencore-amrnb --enable-

In [11]:


if not (dataset_dir/'testing_list.txt').exists(): # Assume dataset already downloaded/extracted if testing list is present
    # One-off dataset preparation:
    #  1. delete .wav files that the wave module cannot parse,
    #  2. trim every class folder to the size of the smallest one,
    #  3. draw, per class, 10% of the recordings for testing and 10% for validation and
    #     write them to testing_list.txt / validation_list.txt as "<class>/<file>" lines.
    audio_dir = dataset_dir/"audio"
    CLASSES = os.listdir(audio_dir)

    # 1. Remove unreadable recordings
    for c in CLASSES:
        for file in os.listdir(audio_dir/c):
            if not file.endswith('.wav'):
                continue
            wav_path = audio_dir/c/file
            try:
                # Context manager closes the handle right away (needed before a delete on Windows)
                with wave.open(str(wav_path)):
                    pass
            except (wave.Error, EOFError):
                # Malformed or truncated WAV file: drop it. Other errors (e.g. permissions)
                # are left to propagate instead of silently deleting data.
                os.remove(wav_path)

    # 2. Balance classes: keep at most `numOfrec` directory entries per class
    numOfrec = min(len(os.listdir(audio_dir/c)) for c in CLASSES)
    num_test = int(numOfrec*0.1)
    for c in CLASSES:
        for surplus in os.listdir(audio_dir/c)[numOfrec:]:
            os.remove(audio_dir/c/surplus)

    # 3. Draw the test/validation recordings. Everything is collected in memory first so that
    #    the list files are only written once the whole split has succeeded.
    testing_lines = []
    validation_lines = []
    for c in CLASSES:
        recs = [rec for rec in os.listdir(audio_dir/c) if rec.endswith('.wav')]
        randomrecs = random.sample(recs, num_test*2)
        # First half of the sample goes to the test list, second half to the validation list
        testing_lines += [c + '/' + rec + '\n' for rec in randomrecs[:num_test]]
        validation_lines += [c + '/' + rec + '\n' for rec in randomrecs[num_test:]]

    # testing_list.txt is written last because its presence marks the dataset as prepared
    with open(dataset_dir/'validation_list.txt', 'w') as list_file:
        list_file.writelines(validation_lines)
    with open(dataset_dir/'testing_list.txt', 'w') as list_file:
        list_file.writelines(testing_lines)
    

## Test 1: Load raw bird song recordings from Xeno-canto without the yellowhammer

In [25]:
# Load the recordings of the first three classes into train/test arrays.
# Files listed in testing_list.txt go to the test set, all others to the training set.
with (dataset_dir/'testing_list.txt').open() as f:
    testing_list = f.read().splitlines()
testing_set = set(testing_list) # Constant-time membership checks in the loop below

# Classes to handle, ordered by label.
# os.listdir() returns entries in arbitrary order, so sort to keep the selected classes
# and their label indices stable between runs and machines.
CLASSES = sorted(os.listdir(dataset_dir/"audio"))[:3]
x_train = []
y_train = []
x_test = []
y_test = []
audiopath = dataset_dir/'audio'
for recording in sorted(audiopath.glob('**/*.wav')):
    if recording.parent.name not in CLASSES: # Ignore unused classes
        continue

    label = CLASSES.index(recording.parent.name) # Assign class number
    with wave.open(str(recording)) as f: # Read wave file
        # Interpreted as interleaved 16-bit signed samples
        # (assumes 16-bit stereo PCM as produced by the ffmpeg conversion cell -- TODO confirm)
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

    data = data.astype(np.float32) # Convert to 32-bit floating-point (new, writable array)
    # Keep the first 10000 frames x 2 channels, zero-padding shorter recordings.
    # With the 48 kHz conversion above this is roughly the first 0.2 s of each clip.
    data.resize((10000, 2))
    if recording.relative_to(audiopath).as_posix() in testing_set: # Assign to test set if file in test list
        x_test.append(data)
        y_test.append(label)
    else:
        x_train.append(data)
        y_train.append(label)

print(f'Loaded {len(x_train)} training samples and {len(x_test)} testing samples')
x_train = np.array(x_train)
y_train = to_categorical(np.array(y_train))
x_test = np.array(x_test)
y_test = to_categorical(np.array(y_test))

Loaded 3987 training samples and 441 testing samples


## Prepare for inference with fixed-point Q7.9 samples by scaling input data accordingly

In [26]:
FIXED_POINT = 9 # Number of fractional bits
# Divide the samples by 2**FIXED_POINT, in place.
# NOTE: not idempotent -- re-running this cell without reloading the data scales it again.
input_scale = 2**FIXED_POINT
for samples in (x_train, x_test):
    samples /= input_scale

## Export small dataset (250 random vectors)

In [27]:
# Pick up to 250 distinct test samples at random (unseeded, so the subset changes on every run)
perms = np.random.permutation(len(y_test))[:250]
x_test_250, y_test_250 = x_test[perms], y_test[perms]

# One CSV row per sample: inputs are flattened, labels stay one-hot
flat_inputs = x_test_250.reshape((x_test_250.shape[0], -1))
np.savetxt('x_test_gsc_250a.csv', flat_inputs, delimiter=',', fmt='%s')
np.savetxt('y_test_gsc_250a.csv', y_test_250, delimiter=',', fmt='%s')

## Build model M5

In [39]:
# 1D CNN on the raw waveform: input of 10000 frames x 2 channels, 3 output classes
model = Sequential([
    Input(shape=(10000, 2)),
    Conv1D(filters=8, kernel_size=30, strides=10, activation='relu'),
    MaxPool1D(pool_size=10),
    Conv1D(filters=8, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=4),
    Conv1D(filters=16, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=4),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=1), # Pooling window of 1
    AvgPool1D(),
    Flatten(),
    Dense(units=3),
    Activation('softmax'), # SoftMax activation needs to be separate from Dense to remove it later on
])

# EXPLORE Learning Rate (10e-3 evaluates to 0.01)
opt = tf.keras.optimizers.Adam(learning_rate=10e-3)
model.summary()
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_16 (Conv1D)          (None, 998, 8)            488       
                                                                 
 max_pooling1d_16 (MaxPoolin  (None, 99, 8)            0         
 g1D)                                                            
                                                                 
 conv1d_17 (Conv1D)          (None, 97, 8)             200       
                                                                 
 max_pooling1d_17 (MaxPoolin  (None, 24, 8)            0         
 g1D)                                                            
                                                                 
 conv1d_18 (Conv1D)          (None, 22, 16)            400       
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 5, 16)           

## Train model

In [41]:
# NOTE(review): the test split is also used as validation data here
model.fit(
    x=x_train,
    y=y_train,
    batch_size=100,
    epochs=9,
    validation_data=(x_test, y_test),
)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x2a36b58a590>

## Evaluate model on test dataset

In [42]:
# Loss/accuracy on the full test split, then the confusion matrix of true vs. predicted class indices
model.evaluate(x_test, y_test, verbose=2)
pred_test = model.predict(x_test)
true_classes = y_test.argmax(axis=1)
predicted_classes = pred_test.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes, predicted_classes))

14/14 - 0s - loss: 0.6614 - categorical_accuracy: 0.7098 - 72ms/epoch - 5ms/step
tf.Tensor(
[[103  28  16]
 [ 18 115  14]
 [ 25  27  95]], shape=(3, 3), dtype=int32)


In [43]:
# Same evaluation restricted to the exported 250-sample subset
model.evaluate(x_test_250, y_test_250, verbose=2)
pred_test_250 = model.predict(x_test_250)
true_classes_250 = y_test_250.argmax(axis=1)
predicted_classes_250 = pred_test_250.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes_250, predicted_classes_250))

8/8 - 0s - loss: 0.6870 - categorical_accuracy: 0.6960 - 45ms/epoch - 6ms/step
tf.Tensor(
[[59 15  9]
 [10 64  9]
 [16 17 51]], shape=(3, 3), dtype=int32)


## Save trained model

In [44]:
# NOTE(review): the Test 2 section saves to this same path later, replacing this file
model.save(filepath='lab_gsc.h5')

## Remove SoftMax layer

In [45]:
# Rebuild the model so that it ends at the second-to-last layer (the Dense layer), dropping the softmax.
# NOTE: not idempotent -- running this cell a second time would cut off one more layer.
pre_softmax_output = model.layers[-2].output
model = tf.keras.Model(inputs=model.input, outputs=pre_softmax_output, name=model.name)

## Install MicroAI for C inference code generation (kerascnn2c module)

In [46]:
# Install the kerascnn2c_fixed subdirectory of the MicroAI archive into the kernel's environment.
# The URL points at a specific archive revision (6adfbcb347d3).
%pip install https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
# Imported here rather than in the imports cell because the package is only installed by the line above
import kerascnn2c

Collecting https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip#subdirectory=third_party/kerascnn2c_fixed
  Downloading https://bitbucket.org/edge-team-leat/microai_public/get/6adfbcb347d3.zip (1.9 MB)
     ---------------------------------------- 1.9/1.9 MB 3.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


## Generate C code for the trained model with 16-bit fixed-point representation

In [47]:
# Configure the converter for 16-bit fixed-point output, then convert the softmax-free model
converter = kerascnn2c.Converter(
    output_path=Path('gsc_output_fixed'),
    fixed_point=FIXED_POINT,      # Number of bits for the fractional part, Q7.9 format
    number_type='int16_t',        # Data type for weights/activations (16 bits quantization)
    long_number_type='int32_t',   # Data type for intermediate results
    number_min=-(2**15),          # Minimum value for 16-bit signed integers
    number_max=(2**15)-1,         # Maximum value for 16-bit signed integers
)
res = converter.convert_model(model)

# Also keep the value returned by the converter in a header file
with open('gsc_model_fixed.h', 'w') as header_file:
    header_file.write(res)

———————————————————————————————————————————————————————————————————————————————————————————————————————
Inputs                           | Layer                            | Outputs                         
———————————————————————————————————————————————————————————————————————————————————————————————————————
                                 | input_5                          | conv1d_16                       
-------------------------------------------------------------------------------------------------------
input_5                          | conv1d_16                        | max_pooling1d_16                
-------------------------------------------------------------------------------------------------------
conv1d_16                        | max_pooling1d_16                 | conv1d_17                       
-------------------------------------------------------------------------------------------------------
max_pooling1d_16                 | conv1d_17                        

## Compile the 16-bit fixed-point C code for x86 and evaluate on small dataset

uncomment main.cpp

In [48]:

# Build the host executable `gsc_fixed` from the generated gsc_output_fixed/model.c and main.cpp,
# with warnings enabled (-Wall -Wextra -pedantic) and -Ofast optimisation.
# -Igsc_output_fixed/ puts the generated directory on the include path.
!g++ -Wall -Wextra -pedantic -Ofast -o gsc_fixed -Igsc_output_fixed/ gsc_output_fixed/model.c main.cpp 


gsc_output_fixed/model.c: In function 'void cnn(const number_t (*)[10000], number_t*)':
     activations1.average_pooling1d_4_output,
     ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~


In [49]:
# Run the compiled model on the 250-sample CSV export from Test 1; the program prints its testing accuracy
!"./gsc_fixed" x_test_gsc_250a.csv y_test_gsc_250a.csv

Testing accuracy: 0.696


comment main.cpp

## Test 2: Load raw bird song recordings from Xeno-canto with the yellowhammer

In [50]:
# Load the recordings of all classes into train/test arrays.
# Files listed in testing_list.txt go to the test set, all others to the training set.
with (dataset_dir/'testing_list.txt').open() as f:
    testing_list = f.read().splitlines()
testing_set = set(testing_list) # Constant-time membership checks in the loop below

# Classes to handle, ordered by label.
# os.listdir() returns entries in arbitrary order, so sort to keep label indices
# stable between runs and machines.
CLASSES = sorted(os.listdir(dataset_dir/"audio"))
x_train = []
y_train = []
x_test = []
y_test = []
audiopath = dataset_dir/'audio'
for recording in sorted(audiopath.glob('**/*.wav')):
    if recording.parent.name not in CLASSES: # Ignore unused classes
        continue

    label = CLASSES.index(recording.parent.name) # Assign class number
    with wave.open(str(recording)) as f: # Read wave file
        # Interpreted as interleaved 16-bit signed samples
        # (assumes 16-bit stereo PCM as produced by the ffmpeg conversion cell -- TODO confirm)
        data = np.frombuffer(f.readframes(f.getnframes()), dtype=np.int16)

    data = data.astype(np.float32) # Convert to 32-bit floating-point (new, writable array)
    # Keep the first 10000 frames x 2 channels, zero-padding shorter recordings.
    # With the 48 kHz conversion above this is roughly the first 0.2 s of each clip.
    data.resize((10000, 2))
    if recording.relative_to(audiopath).as_posix() in testing_set: # Assign to test set if file in test list
        x_test.append(data)
        y_test.append(label)
    else:
        x_train.append(data)
        y_train.append(label)

print(f'Loaded {len(x_train)} training samples and {len(x_test)} testing samples')
x_train = np.array(x_train)
y_train = to_categorical(np.array(y_train))
x_test = np.array(x_test)
y_test = to_categorical(np.array(y_test))

Loaded 5316 training samples and 588 testing samples


## Prepare for inference with fixed-point Q7.9 samples by scaling input data accordingly

In [51]:
FIXED_POINT = 9 # Number of fractional bits
# Divide the samples by 2**FIXED_POINT, in place.
# NOTE: not idempotent -- re-running this cell without reloading the data scales it again.
input_scale = 2**FIXED_POINT
for samples in (x_train, x_test):
    samples /= input_scale

In [52]:
# Pick up to 250 distinct test samples at random (unseeded, so the subset changes on every run)
perms = np.random.permutation(len(y_test))[:250]
x_test_250, y_test_250 = x_test[perms], y_test[perms]

# One CSV row per sample: inputs are flattened, labels stay one-hot
flat_inputs = x_test_250.reshape((x_test_250.shape[0], -1))
np.savetxt('x_test_gsc_250b.csv', flat_inputs, delimiter=',', fmt='%s')
np.savetxt('y_test_gsc_250b.csv', y_test_250, delimiter=',', fmt='%s')

In [53]:
# 1D CNN on the raw waveform: input of 10000 frames x 2 channels, 4 output classes
model = Sequential([
    Input(shape=(10000, 2)),
    Conv1D(filters=8, kernel_size=30, strides=10, activation='relu'),
    MaxPool1D(pool_size=10),
    Conv1D(filters=8, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=4),
    Conv1D(filters=16, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=4),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPool1D(pool_size=1), # Pooling window of 1
    AvgPool1D(),
    Flatten(),
    Dense(units=4),
    Activation('softmax'), # SoftMax activation needs to be separate from Dense to remove it later on
])

# EXPLORE Learning Rate (10e-3 evaluates to 0.01)
opt = tf.keras.optimizers.Adam(learning_rate=10e-3)
model.summary()
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_20 (Conv1D)          (None, 998, 8)            488       
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 99, 8)            0         
 g1D)                                                            
                                                                 
 conv1d_21 (Conv1D)          (None, 97, 8)             200       
                                                                 
 max_pooling1d_21 (MaxPoolin  (None, 24, 8)            0         
 g1D)                                                            
                                                                 
 conv1d_22 (Conv1D)          (None, 22, 16)            400       
                                                                 
 max_pooling1d_22 (MaxPoolin  (None, 5, 16)           

In [56]:
# NOTE(review): the test split is also used as validation data here
model.fit(
    x=x_train,
    y=y_train,
    batch_size=50,
    epochs=20,
    validation_data=(x_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x29fac701240>

## Evaluate model on test dataset

In [57]:
# Loss/accuracy on the full test split, then the confusion matrix of true vs. predicted class indices
model.evaluate(x_test, y_test, verbose=2)
pred_test = model.predict(x_test)
true_classes = y_test.argmax(axis=1)
predicted_classes = pred_test.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes, predicted_classes))

19/19 - 0s - loss: 0.9575 - categorical_accuracy: 0.5918 - 92ms/epoch - 5ms/step
tf.Tensor(
[[ 84  18  13  32]
 [ 17 103  13  14]
 [ 15  18  87  27]
 [ 20  20  33  74]], shape=(4, 4), dtype=int32)


In [58]:
# Same evaluation restricted to the exported 250-sample subset
model.evaluate(x_test_250, y_test_250, verbose=2)
pred_test_250 = model.predict(x_test_250)
true_classes_250 = y_test_250.argmax(axis=1)
predicted_classes_250 = pred_test_250.argmax(axis=1)
print(tf.math.confusion_matrix(true_classes_250, predicted_classes_250))

8/8 - 0s - loss: 0.9478 - categorical_accuracy: 0.5920 - 63ms/epoch - 8ms/step
tf.Tensor(
[[37  7  3 17]
 [ 5 43  6  3]
 [ 6  9 37 11]
 [ 7 14 14 31]], shape=(4, 4), dtype=int32)


## Save trained model

In [59]:
# NOTE(review): same path as the Test 1 save, so the 3-class model file written there is replaced
model.save(filepath='lab_gsc.h5')

## Remove SoftMax layer

In [60]:
# Rebuild the model so that it ends at the second-to-last layer (the Dense layer), dropping the softmax.
# NOTE: not idempotent -- running this cell a second time would cut off one more layer.
pre_softmax_output = model.layers[-2].output
model = tf.keras.Model(inputs=model.input, outputs=pre_softmax_output, name=model.name)

## Generate C code for the trained model with 16-bit fixed-point representation

In [61]:
# Configure the converter for 16-bit fixed-point output, then convert the softmax-free model.
# Output paths are the same as in Test 1, so the files generated there are replaced.
converter = kerascnn2c.Converter(
    output_path=Path('gsc_output_fixed'),
    fixed_point=FIXED_POINT,      # Number of bits for the fractional part, Q7.9 format
    number_type='int16_t',        # Data type for weights/activations (16 bits quantization)
    long_number_type='int32_t',   # Data type for intermediate results
    number_min=-(2**15),          # Minimum value for 16-bit signed integers
    number_max=(2**15)-1,         # Maximum value for 16-bit signed integers
)
res = converter.convert_model(model)

# Also keep the value returned by the converter in a header file
with open('gsc_model_fixed.h', 'w') as header_file:
    header_file.write(res)

———————————————————————————————————————————————————————————————————————————————————————————————————————
Inputs                           | Layer                            | Outputs                         
———————————————————————————————————————————————————————————————————————————————————————————————————————
                                 | input_6                          | conv1d_20                       
-------------------------------------------------------------------------------------------------------
input_6                          | conv1d_20                        | max_pooling1d_20                
-------------------------------------------------------------------------------------------------------
conv1d_20                        | max_pooling1d_20                 | conv1d_21                       
-------------------------------------------------------------------------------------------------------
max_pooling1d_20                 | conv1d_21                        

## Compile the 16-bit fixed-point C code for x86 and evaluate on small dataset

uncomment main.cpp

In [66]:

# Rebuild the host executable `gsc_fixed` from the regenerated gsc_output_fixed/model.c and main.cpp,
# with warnings enabled (-Wall -Wextra -pedantic) and -Ofast optimisation.
# -Igsc_output_fixed/ puts the generated directory on the include path.
!g++ -Wall -Wextra -pedantic -Ofast -o gsc_fixed -Igsc_output_fixed/ gsc_output_fixed/model.c main.cpp 


gsc_output_fixed/model.c: In function 'void cnn(const number_t (*)[10000], number_t*)':
     activations1.average_pooling1d_5_output,
     ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~


In [67]:
# Run the compiled model on the 250-sample CSV export from Test 2; the program prints its testing accuracy
!"./gsc_fixed" x_test_gsc_250b.csv y_test_gsc_250b.csv

Testing accuracy: 0.588


comment main.cpp