### Initial Steps:

1. Upload `kaggle.json`.
2. Make an empty directory `Others`.
3. Upload the folder `Manasi` which has Manasi's recordings.

### Downloading the Dataset

In [None]:
# Install Kaggle's command-line client, used below to download the competition data.
!pip install -q kaggle

In [None]:
!mkdir ~/.kaggle

In [None]:
# Move the uploaded kaggle.json into ~/.kaggle/.
!mv kaggle.json ~/.kaggle/

In [None]:
# Restrict the token file to read/write for the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive (about 3.5 GB in the recorded run).
!kaggle competitions download -c tensorflow-speech-recognition-challenge

Downloading tensorflow-speech-recognition-challenge.zip to /content
100% 3.49G/3.50G [00:45<00:00, 147MB/s]
100% 3.50G/3.50G [00:45<00:00, 82.5MB/s]


In [None]:
!unzip -q tensorflow-speech-recognition-challenge.zip

### Importing Libraries


In [None]:
!pip install ibm-watson

Collecting ibm-watson
  Downloading ibm-watson-7.0.1.tar.gz (389 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.3/389.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ibm-cloud-sdk-core==3.*,>=3.3.6 (from ibm-watson)
  Downloading ibm-cloud-sdk-core-3.18.0.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting urllib3<2.0.0,>=1.26.18 (from ibm-cloud-sdk-core==3.*,>=3.3.6->ibm-watson)
  Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143

In [None]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd
import shutil

import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa
from IPython.display import Audio

from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd
import joblib

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

### Loading the Data

In [None]:
!apt-get install -y p7zip-full
!7z x /content/train.7z

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-8).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 1121103842 bytes (1070 MiB)

Extracting archive: /content/train.7z
--
Path = /content/train.7z
Type = 7z
Physical Size = 1121103842
Headers Size = 389133
Method = Delta LZMA2:24
Solid = +
Blocks = 2

  0%      0% 38 - train/audio/_background_noise_/doing_the_dishes.wav                                                               0% 40

### Data Preparation

Copying all audio files from the 'go', 'left' and 'right' directories to the 'Others' folder.

In [None]:
# Running prefix shared by every move_audios() call; reset whenever this cell runs.
counter = 0

def move_audios(source_folder, destination_folder):
  """Copy every ``.wav`` file of ``source_folder`` into ``destination_folder``.

  Despite the name, files are copied with ``shutil.copy2`` and the originals
  stay in place. Each copy's name is prefixed with the module-level
  ``counter``, which is incremented once per file and keeps counting across
  calls, so destination names stay unique when several folders are merged.

  The destination folder is created when it does not exist, and the source
  files are processed in sorted order so the numbering is reproducible.

  Args:
    source_folder: directory whose ``.wav`` files are copied.
    destination_folder: directory that receives the numbered copies.
  """
  global counter
  os.makedirs(destination_folder, exist_ok=True)
  for filename in sorted(os.listdir(source_folder)):
    if not filename.endswith(".wav"):
      continue
    counter += 1
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, str(counter) + filename)
    shutil.copy2(source_path, destination_path)
  print(f'Copied files from {source_folder} to {destination_folder}')

# Pool the three command folders into the single 'Others' folder.
for word_folder in ('/content/train/audio/go',
                    '/content/train/audio/left',
                    '/content/train/audio/right'):
  move_audios(word_folder, '/content/Others')

Copied files from /content/train/audio/go to /content/Others
Copied files from /content/train/audio/left to /content/Others
Copied files from /content/train/audio/right to /content/Others


Verifying the Transfer

In [None]:
def count_files(folder):
  """Print how many entries directly inside ``folder`` end in ``.wav``."""
  wav_total = sum(1 for entry in os.listdir(folder) if entry.endswith(".wav"))
  print(f'Number of files in {folder}:', wav_total)

# The three source folders first, then the pooled folder: the last count
# should equal the sum of the first three.
for counted_folder in ('/content/train/audio/go',
                       '/content/train/audio/left',
                       '/content/train/audio/right',
                       '/content/Others'):
  count_files(counted_folder)

Number of files in /content/train/audio/go: 2372
Number of files in /content/train/audio/left: 2353
Number of files in /content/train/audio/right: 2367
Number of files in /content/Others: 7092


Expanding the `Manasi` audio recording dataset.

In [None]:
def copy_files(source_folder, num_copies):
    """Duplicate every ``.wav`` file in ``source_folder`` ``num_copies`` times.

    The copies are written into ``source_folder`` itself and are named with a
    running integer (``0.wav``, ``1.wav``, ...). A number whose file already
    exists is skipped, so a copy never overwrites a file that is already in
    the folder and is never written onto its own source.

    The list of source files is taken once, before any copy is written, and is
    processed in sorted order so the numbering is reproducible.

    Note: every ``.wav`` present when the function starts is treated as a
    source, including copies left by an earlier call, so run this only once
    per folder.

    Args:
        source_folder: directory that holds the recordings and receives the copies.
        num_copies: number of copies to create for each recording.
    """
    counter = 0
    wav_names = sorted(
        name for name in os.listdir(source_folder) if name.endswith(".wav")
    )
    for filename in wav_names:
        source_path = os.path.join(source_folder, filename)

        for _ in range(num_copies):
            destination_path = os.path.join(source_folder, f"{counter}.wav")
            # Advance past names that are already taken instead of clobbering them.
            while os.path.exists(destination_path):
                counter += 1
                destination_path = os.path.join(source_folder, f"{counter}.wav")
            counter += 1
            shutil.copy2(source_path, destination_path)

    print(f"{num_copies} random copies of each WAV file created in '{source_folder}'.")

source_folder = "/content/Manasi"
# 236 copies per recording: in the recorded run this grows the folder to 7110
# files, close to the 7092 files gathered in Others.
num_copies = 236
copy_files(source_folder, num_copies)
count_files('/content/Manasi')

236 random copies of each WAV file created in '/content/Manasi'.
Number of files in /content/Manasi: 7110


### Extracting MFCCs from Audio Files

In [None]:
# Column layout of the feature tables: mfcc1..mfcc20 followed by the label.
header = [f'mfcc{index}' for index in range(1, 21)] + ['label']
header

['mfcc1',
 'mfcc2',
 'mfcc3',
 'mfcc4',
 'mfcc5',
 'mfcc6',
 'mfcc7',
 'mfcc8',
 'mfcc9',
 'mfcc10',
 'mfcc11',
 'mfcc12',
 'mfcc13',
 'mfcc14',
 'mfcc15',
 'mfcc16',
 'mfcc17',
 'mfcc18',
 'mfcc19',
 'mfcc20',
 'label']

In [None]:
def mfcc_extractor(source_folder):
  """Build a table of mean MFCCs for every ``.wav`` file in ``source_folder``.

  Each file is loaded as mono audio limited to its first second; its MFCC
  matrix is averaged per coefficient, giving one row per file. The last
  column holds the label, which is the name of ``source_folder``.

  Rows are collected in a list and turned into a DataFrame once, because
  ``DataFrame.append`` no longer exists in pandas 2.x and growing a frame row
  by row is quadratic. Files are processed in sorted order so the row order
  is reproducible, and entries that do not end in ``.wav`` are skipped.

  Args:
    source_folder: directory containing the recordings of one speaker class.

  Returns:
    A DataFrame with the columns listed in the module-level ``header``.
    NOTE(review): this relies on ``librosa.feature.mfcc`` returning as many
    coefficients as there are mfcc columns in ``header`` (20) - confirm if
    either is changed.
  """
  # normpath drops a trailing slash, which would otherwise yield an empty label.
  label = os.path.basename(os.path.normpath(source_folder))
  rows = []
  for filename in sorted(os.listdir(source_folder)):
    if not filename.endswith(".wav"):
      continue
    file_name = os.path.join(source_folder, filename)
    y, sr = librosa.load(file_name, mono=True, duration=1)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    rows.append([np.mean(coefficient) for coefficient in mfcc] + [label])
  return pd.DataFrame(rows, columns=header)

# One feature table per speaker class; the label column is the folder name.
df_a = mfcc_extractor('/content/Others')
df_b = mfcc_extractor('/content/Manasi')

display(df_a)
display(df_b)

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,-340.869629,119.426872,-40.449615,43.707653,-7.435071,20.602591,-6.668093,14.847510,1.014987,11.119338,...,-3.428279,19.821747,-3.434635,8.518586,3.731715,7.275608,0.965147,2.976894,4.656776,Others
1,-448.516205,80.936378,-19.070787,6.020117,-11.507313,-5.487815,-4.879198,-11.293888,-7.449798,2.781065,...,-3.952448,6.166394,-7.271213,2.190345,-7.274241,-2.127176,-0.669660,-8.835884,-1.646760,Others
2,-276.624725,125.389549,-28.572985,35.782375,-9.117649,21.904037,-6.083468,9.221219,9.078003,14.618130,...,-7.381569,2.778253,-0.123015,9.510960,-5.780591,5.441728,5.026753,-2.719311,5.298777,Others
3,-289.382965,112.436310,-38.801777,29.734760,-22.343855,26.851585,-15.934223,9.543270,4.554204,8.119537,...,-16.523462,4.635334,-7.423292,4.148916,-2.529671,1.173305,-9.393332,2.783011,3.279809,Others
4,-308.935974,76.120369,-32.642555,1.190896,-33.551445,-0.605611,-23.716413,-7.117409,-18.837067,-5.439693,...,-13.649323,2.565130,-10.151177,-0.938908,-17.222439,-6.294281,-9.383703,-8.385914,-4.270711,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7087,-516.875549,60.240623,-24.902458,34.606621,-34.494938,-4.098581,-3.284007,-12.563572,-10.657320,-2.086535,...,-17.824751,-4.148757,-7.333536,-4.801960,-1.423499,-5.312108,-3.545270,-3.618134,-1.156922,Others
7088,-459.144135,60.829285,-39.998478,-5.488566,-51.428757,-35.626884,-35.402077,-23.210337,3.075179,8.345455,...,-7.212066,8.889968,-2.384934,9.280100,-4.798639,-1.819007,5.897075,3.865324,3.438741,Others
7089,-450.028290,147.081131,-26.480137,41.253513,-14.289305,22.216070,1.304536,15.137282,-1.774724,-0.650521,...,-1.441314,7.679136,-6.150804,3.801738,-2.839072,3.577706,-2.186460,-2.339494,2.616156,Others
7090,-248.608978,147.158142,-32.777729,19.075912,-9.176285,-2.710263,-33.748001,-12.436738,-10.479805,5.609719,...,-14.388505,5.838201,-3.541405,-0.081939,-5.287097,-3.656764,-5.133524,1.061682,2.005845,Others


Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,-590.985291,47.997990,14.633166,-1.816540,-15.219229,-4.899289,-17.899782,-13.753470,-16.133863,-9.376106,...,-5.288478,-7.358153,-5.571167,-6.179213,0.072655,-8.950734,-4.666564,-5.022236,-6.551546,Manasi
1,-593.389648,61.828323,16.449974,-7.772598,-17.982878,-8.775875,-6.570965,-14.307156,-22.721165,-15.647150,...,-3.607072,-6.225061,-4.011949,-3.321737,-6.159984,-9.484513,-3.803367,-5.422918,-5.459632,Manasi
2,-583.654724,38.042332,4.684601,6.897035,-11.615083,-11.585680,-23.535675,-18.293205,-8.038123,-5.873096,...,-4.221819,-5.825244,-4.105130,-10.642753,0.612530,-8.494279,-6.638670,-3.008335,-2.089005,Manasi
3,-665.706360,70.307030,17.568184,-1.750926,-20.652100,-16.507679,-13.660543,-10.469974,-13.375977,-12.110925,...,-13.654526,-13.745111,-3.611436,-7.895039,-6.275577,-8.433453,-4.339338,-10.097034,-6.274808,Manasi
4,-553.460205,53.987717,19.969641,-5.119381,-18.355339,-8.174364,-13.391295,-15.716059,-17.169828,-15.130288,...,-2.302924,-5.451685,1.831648,-5.062371,-4.817216,-4.312774,-1.608142,-4.664159,-5.100047,Manasi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7105,-595.564514,30.030180,1.567706,11.745807,-7.672833,-12.088867,-17.303549,-21.146194,-7.406392,-14.333838,...,-4.098935,-4.591977,-8.067422,-10.131382,-5.894920,-12.529480,-8.848062,-7.338245,2.144095,Manasi
7106,-647.134705,64.158310,-8.503198,1.999617,-9.860838,-9.615888,-18.604460,-19.727381,-12.903796,-9.081964,...,-10.159987,-13.430134,-6.593185,-4.565743,-4.233055,-12.511898,-5.645334,-6.769784,-3.000241,Manasi
7107,-622.407837,60.772804,-7.036108,2.289425,-7.372377,-8.989063,-17.079058,-23.133156,-12.185425,-12.629683,...,-7.336113,-10.514457,-7.527379,-2.045815,-3.663898,-11.588941,-3.953467,-3.718553,-1.322787,Manasi
7108,-626.331848,65.293968,-12.951875,2.146328,-14.191212,-15.668148,-18.999125,-19.443169,-10.301560,-8.329764,...,-11.597652,-13.760733,-6.948579,-2.411735,-5.953706,-13.466730,-5.894031,-7.202748,-3.247572,Manasi


We now have MFCCs for the audio from both types of speakers. We'll concatenate the two dataframes and put them through a clustering algorithm.

In [None]:
# ignore_index gives the combined table one continuous 0..n-1 index; without
# it the row labels of both inputs are kept, so most labels occur twice.
df_final = pd.concat([df_a, df_b], ignore_index=True)
df_final

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,-340.869629,119.426872,-40.449615,43.707653,-7.435071,20.602591,-6.668093,14.847510,1.014987,11.119338,...,-3.428279,19.821747,-3.434635,8.518586,3.731715,7.275608,0.965147,2.976894,4.656776,Others
1,-448.516205,80.936378,-19.070787,6.020117,-11.507313,-5.487815,-4.879198,-11.293888,-7.449798,2.781065,...,-3.952448,6.166394,-7.271213,2.190345,-7.274241,-2.127176,-0.669660,-8.835884,-1.646760,Others
2,-276.624725,125.389549,-28.572985,35.782375,-9.117649,21.904037,-6.083468,9.221219,9.078003,14.618130,...,-7.381569,2.778253,-0.123015,9.510960,-5.780591,5.441728,5.026753,-2.719311,5.298777,Others
3,-289.382965,112.436310,-38.801777,29.734760,-22.343855,26.851585,-15.934223,9.543270,4.554204,8.119537,...,-16.523462,4.635334,-7.423292,4.148916,-2.529671,1.173305,-9.393332,2.783011,3.279809,Others
4,-308.935974,76.120369,-32.642555,1.190896,-33.551445,-0.605611,-23.716413,-7.117409,-18.837067,-5.439693,...,-13.649323,2.565130,-10.151177,-0.938908,-17.222439,-6.294281,-9.383703,-8.385914,-4.270711,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7105,-595.564514,30.030180,1.567706,11.745807,-7.672833,-12.088867,-17.303549,-21.146194,-7.406392,-14.333838,...,-4.098935,-4.591977,-8.067422,-10.131382,-5.894920,-12.529480,-8.848062,-7.338245,2.144095,Manasi
7106,-647.134705,64.158310,-8.503198,1.999617,-9.860838,-9.615888,-18.604460,-19.727381,-12.903796,-9.081964,...,-10.159987,-13.430134,-6.593185,-4.565743,-4.233055,-12.511898,-5.645334,-6.769784,-3.000241,Manasi
7107,-622.407837,60.772804,-7.036108,2.289425,-7.372377,-8.989063,-17.079058,-23.133156,-12.185425,-12.629683,...,-7.336113,-10.514457,-7.527379,-2.045815,-3.663898,-11.588941,-3.953467,-3.718553,-1.322787,Manasi
7108,-626.331848,65.293968,-12.951875,2.146328,-14.191212,-15.668148,-18.999125,-19.443169,-10.301560,-8.329764,...,-11.597652,-13.760733,-6.948579,-2.411735,-5.953706,-13.466730,-5.894031,-7.202748,-3.247572,Manasi


### K Means Clustering

In [None]:
kmeans = KMeans(n_clusters=2, random_state=0)

y = kmeans.fit_predict(df_final[header[:-1]])

# K-Means numbers its clusters arbitrarily, so the id that stands for 'Manasi'
# is derived from the data instead of being hard-coded: it is the cluster with
# the larger share of recordings labelled 'Manasi'. Anything that decodes raw
# cluster ids from this model elsewhere has to use the same mapping.
manasi_share = (
    pd.Series(df_final['label'].to_numpy() == 'Manasi')
    .groupby(y)
    .mean()
)
manasi_cluster = manasi_share.idxmax()

decoded_labels = ['Manasi' if cluster == manasi_cluster else 'Others' for cluster in y]

df_final['Predicted Speaker'] = decoded_labels
df_final

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label,Predicted Speaker
0,-340.869629,119.426872,-40.449615,43.707653,-7.435071,20.602591,-6.668093,14.847510,1.014987,11.119338,...,19.821747,-3.434635,8.518586,3.731715,7.275608,0.965147,2.976894,4.656776,Others,Others
1,-448.516205,80.936378,-19.070787,6.020117,-11.507313,-5.487815,-4.879198,-11.293888,-7.449798,2.781065,...,6.166394,-7.271213,2.190345,-7.274241,-2.127176,-0.669660,-8.835884,-1.646760,Others,Others
2,-276.624725,125.389549,-28.572985,35.782375,-9.117649,21.904037,-6.083468,9.221219,9.078003,14.618130,...,2.778253,-0.123015,9.510960,-5.780591,5.441728,5.026753,-2.719311,5.298777,Others,Others
3,-289.382965,112.436310,-38.801777,29.734760,-22.343855,26.851585,-15.934223,9.543270,4.554204,8.119537,...,4.635334,-7.423292,4.148916,-2.529671,1.173305,-9.393332,2.783011,3.279809,Others,Others
4,-308.935974,76.120369,-32.642555,1.190896,-33.551445,-0.605611,-23.716413,-7.117409,-18.837067,-5.439693,...,2.565130,-10.151177,-0.938908,-17.222439,-6.294281,-9.383703,-8.385914,-4.270711,Others,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7105,-595.564514,30.030180,1.567706,11.745807,-7.672833,-12.088867,-17.303549,-21.146194,-7.406392,-14.333838,...,-4.591977,-8.067422,-10.131382,-5.894920,-12.529480,-8.848062,-7.338245,2.144095,Manasi,Manasi
7106,-647.134705,64.158310,-8.503198,1.999617,-9.860838,-9.615888,-18.604460,-19.727381,-12.903796,-9.081964,...,-13.430134,-6.593185,-4.565743,-4.233055,-12.511898,-5.645334,-6.769784,-3.000241,Manasi,Manasi
7107,-622.407837,60.772804,-7.036108,2.289425,-7.372377,-8.989063,-17.079058,-23.133156,-12.185425,-12.629683,...,-10.514457,-7.527379,-2.045815,-3.663898,-11.588941,-3.953467,-3.718553,-1.322787,Manasi,Manasi
7108,-626.331848,65.293968,-12.951875,2.146328,-14.191212,-15.668148,-18.999125,-19.443169,-10.301560,-8.329764,...,-13.760733,-6.948579,-2.411735,-5.953706,-13.466730,-5.894031,-7.202748,-3.247572,Manasi,Manasi


In [None]:
# Number of recordings assigned to each speaker.
df_final['Predicted Speaker'].value_counts()

Manasi    8786
Others    5416
Name: Predicted Speaker, dtype: int64

In [None]:
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, so the
# value is unchanged, but the call now follows the documented argument order.
score = accuracy_score(df_final['label'], df_final['Predicted Speaker'])
score

0.8819884523306577

# The results are about 88.2% accurate.

### Testing it for 1 Audio

In [None]:
# generating mfcc

file_name = '/content/train/audio/go/004ae714_nohash_0.wav'

# Same loading settings as mfcc_extractor (first second of audio only), so the
# features of this clip are computed the same way as the ones the model was fit on.
y, sr = librosa.load(file_name, mono=True, duration=1)
mfcc = librosa.feature.mfcc(y=y, sr=sr)
# One mean value per MFCC coefficient.
mean_mfcc = [np.mean(e) for e in mfcc]

mean_mfcc

[-438.0618,
 68.250595,
 -79.96585,
 83.998764,
 -36.64955,
 18.09715,
 -0.06727524,
 -1.9219625,
 1.5303146,
 9.564461,
 8.683585,
 4.5434747,
 12.625023,
 1.9322301,
 14.746313,
 -7.3965197,
 5.5047846,
 4.8438864,
 7.804177,
 3.1512775]

In [None]:
# Single-row feature table for the test clip. reindex adds the 'label' column
# (left empty) so the frame has the same columns as the training tables.
# DataFrame.append is avoided because it no longer exists in pandas 2.x.
df_test = pd.DataFrame([mean_mfcc], columns=header[:-1]).reindex(columns=header)
df_test

Unnamed: 0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,-438.061798,68.250595,-79.965851,83.998764,-36.649551,18.097151,-0.067275,-1.921962,1.530315,9.564461,...,4.543475,12.625023,1.93223,14.746313,-7.39652,5.504785,4.843886,7.804177,3.151278,


In [None]:
# making predictions
MODEL_PATH = 'speaker_identifier.pkl'

# On a fresh top-to-bottom run the model file has not been written yet, so
# persist the model fitted above before reloading it. An existing file is
# left alone and used as-is.
if not os.path.exists(MODEL_PATH):
  joblib.dump(kmeans, MODEL_PATH)

# NOTE: joblib.load unpickles the file; only load model files you trust.
kmeans = joblib.load(MODEL_PATH)
speaker_pred = kmeans.predict(df_test[header[:-1]])
speaker_pred

array([1], dtype=int32)

In [None]:
# Cluster id 1 is reported as 'Others', anything else as 'Manasi'.
decoded_label = 'Others' if speaker_pred == 1 else 'Manasi'

print('Predicted Speaker:', decoded_label)

Predicted Speaker: Others


The prediction is correct.

### Converting Speech to Text to Extract the Command

In [None]:
# Credentials must not live in the notebook. The service URL and API key are
# read from the environment, and the key is asked for interactively when it is
# not set. The key that used to be written here has been exposed and should be
# revoked and replaced in the IBM Cloud console.
apiUrl = os.environ.get(
    "SPEECH_TO_TEXT_URL",
    "https://api.eu-gb.speech-to-text.watson.cloud.ibm.com/instances/5f9e33da-3d8f-4924-9b18-2ef9c3dd288d",
)
myKey = os.environ.get("SPEECH_TO_TEXT_APIKEY")
if not myKey:
  from getpass import getpass
  myKey = getpass("IBM Speech to Text API key: ")

In [None]:
# Build the Speech to Text client: authenticate with the API key, then point
# the client at the service URL configured above.
auth = IAMAuthenticator(myKey)
Speech2Text = SpeechToTextV1(authenticator = auth)
Speech2Text.set_service_url(apiUrl)

In [None]:
with open("/content/Manasi/179.wav", mode="rb") as wav:
    response = Speech2Text.recognize(audio=wav, content_type="audio/wav")

# When nothing is recognised there is no first result to index into. Fall back
# to an empty transcript instead of raising IndexError, so the command step
# can answer 'record again'.
stt_results = response.result.get('results') or []
if stt_results and stt_results[0].get('alternatives'):
    recognized_text = stt_results[0]['alternatives'][0]['transcript']
else:
    recognized_text = ''

In [None]:
# Transcript extracted from the speech-to-text response above.
recognized_text

'go '

### Listening to the Audio

In [None]:
# The samples get their own name: calling them `signal` would overwrite the
# `scipy.signal` module imported at the top of the notebook.
audio_samples, sample_rate = librosa.load('/content/train/audio/go/004ae714_nohash_0.wav')
Audio(audio_samples, rate=sample_rate)

### Returning Command

In [None]:
# Compare whole words rather than substrings, so that e.g. "good" or "ago" is
# not read as 'go' and "copyright" is not read as 'right'.
spoken_words = set(recognized_text.lower().split())

# "write" is accepted as a mis-transcription of "right".
if spoken_words & {'right', 'write'}:
  command = 'right'
elif 'go' in spoken_words:
  command = 'go'
elif 'left' in spoken_words:
  command = 'left'
else:
  command = 'record again'

print('Command is:', command)

Command is: go


### Saving the Model

In [None]:
# Write the K-Means model to disk so it can be reloaded with joblib.load.
joblib.dump(kmeans, 'speaker_identifier.pkl')

['speaker_identifier.pkl']

In [None]:
# Full response payload, including the confidence of the transcript.
response.result

{'result_index': 0,
 'results': [{'final': True,
   'alternatives': [{'transcript': 'go ', 'confidence': 0.87}]}]}