## Check GPU availability

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
# List the GPUs TensorFlow can see and ask it to grow GPU memory on demand
# instead of reserving all of it up front.
physical_devices = tf.config.list_physical_devices('GPU')
print("Number of GPUs Available: ", len(physical_devices))
try:
    # Configure every visible GPU, not just the first one.
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth has to be set before the GPUs are initialized; an earlier
    # cell that already touched the GPU would make this call fail.
    # Report the problem instead of aborting the notebook.
    print(err)

Number of GPUs Available:  1


## Paths

In [None]:
CUR_DIR = '/content/'  # Colab working directory; this may differ when running locally

# Spectrogram matrices (.npy) written by spectrogram_mat
MAT_DIR = CUR_DIR + 'mat/'
TRAIN_MAT_PATH = MAT_DIR + 'train/'
TEST_MAT_PATH = MAT_DIR + 'test/'

# Raw audio extracted from the spoken-language-identification archive
VOICE_DIR = CUR_DIR + 'voice/'
TRAIN_VOICE_PATH = VOICE_DIR + 'train/train/'
TEST_VOICE_PATH = VOICE_DIR + 'test/test/'

# Google Drive folders (reachable once the drive is mounted under CUR_DIR + 'drive/')
DRIVE_PATH = CUR_DIR + 'drive/MyDrive/Bachelor_Project/'
DRIVE2_PATH = CUR_DIR + 'drive/MyDrive/Bachelor_Project_Part2/'
DRIVE3_PATH = CUR_DIR + 'drive/MyDrive/Bachelor_Project_Part3/'

# Disabled legacy paths. NOTE(review): some later cells and helpers
# (e.g. move_to_label, show_spectrogram_mat) still reference these names.
#TRAIN_VOICE_OLD_PATH = CUR_DIR+'train/train/'
#TEST_VOICE_OLD_PATH = CUR_DIR+'test/test/'
#TRAIN_IMG_PATH = CUR_DIR+'train/train-img/'
#TEST_IMG_PATH = CUR_DIR+'test/test-img/'

# Mozilla Common Voice archive
MOZILLA_PATH = CUR_DIR + 'mozilla/'
MOZILLA_VALID_TRAIN = MOZILLA_PATH + 'cv-valid-train/'
MOZILLA_VALID_DEV = MOZILLA_PATH + 'cv-valid-dev/'
MOZILLA_VALID_TEST = MOZILLA_PATH + 'cv-valid-test/'

In [None]:
! mkdir $MAT_DIR
! mkdir $VOICE_DIR
! mkdir $TRAIN_MAT_PATH
! mkdir $TEST_MAT_PATH

## mount google drive

In [None]:
from google.colab import drive
drive.mount(CUR_DIR+'drive/', force_remount=True)
#%cd '{CUR_DIR}drive/MyDrive/Bachelor_Project/'
#%ls

Mounted at /content/drive/


## import libraries



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from glob import glob
import librosa
import librosa.display
import soundfile as sf
import scipy.signal as signal
import gc
import shutil

## spoken-language-identification dataset


In [None]:
! mkdir ~/.kaggle
! cp $DRIVE_PATH/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download toponowicz/spoken-language-identification

Downloading spoken-language-identification.zip to /content
 29% 4.34G/14.9G [03:33<07:56, 23.9MB/s]

In [None]:
%%capture
!unzip {CUR_DIR}spoken-language-identification.zip -d $VOICE_DIR

### create dataframe

In [None]:
# Collect every file under the train audio folder and derive, for each one, the
# name its spectrogram is stored under: same file name with the trailing
# 'flac' replaced by 'npy'.
train_data_dir = np.array(glob(TRAIN_VOICE_PATH + "*"))
format_len = len('flac')
traindf = pd.DataFrame({'file': train_data_dir})
traindf['ID'] = traindf['file'].map(
    lambda path: path.rsplit('/', 1)[-1][:-format_len] + 'npy'
)
traindf

In [None]:
# Collect every file under the test audio folder and derive, for each one, the
# name its spectrogram is stored under: same file name with the trailing
# 'flac' replaced by 'npy'.
test_data_dir = np.array(glob(TEST_VOICE_PATH + "*"))
format_len = len('flac')
testdf = pd.DataFrame({'file': test_data_dir})
testdf['ID'] = testdf['file'].map(
    lambda path: path.rsplit('/', 1)[-1][:-format_len] + 'npy'
)
testdf

In [None]:
traindf['lang'] = traindf['ID'].apply(lambda x: x.split('_')[0])
testdf['lang'] = testdf['ID'].apply(lambda x: x.split('_')[0])

##

### create folders

In [None]:
! mkdir $CUR_DIR/train
! mkdir $CUR_DIR/test

In [None]:
! mv $TRAIN_VOICE_PATH $TRAIN_VOICE_OLD_PATH
! mv $TEST_VOICE_PATH $TEST_VOICE_OLD_PATH

In [None]:
# NOTE(review): TRAIN_IMG_PATH and TEST_IMG_PATH are only present as commented-out
# assignments in the "Paths" cell, so nothing is substituted for them here and
# mkdir reports "missing operand" (see the captured output of this cell).
!mkdir $TRAIN_IMG_PATH
!mkdir $TEST_IMG_PATH

mkdir: missing operand
Try 'mkdir --help' for more information.
mkdir: missing operand
Try 'mkdir --help' for more information.


## Mozilla Dataset

In [None]:
! kaggle datasets download mozillaorg/common-voice

In [None]:
!mkdir mozilla

In [None]:
%%capture
!unzip {CUR_DIR}common-voice.zip -d $MOZILLA_PATH

### remove invalid and other datasets

In [None]:
!rm $MOZILLA_PATH/LICENSE.txt
!rm $MOZILLA_PATH/README.txt
!rm -r $MOZILLA_PATH/cv-other*
!rm -r $MOZILLA_PATH/cv-invalid*

In [None]:
valid_train_df = pd.read_csv(MOZILLA_PATH+"cv-valid-train.csv")
valid_train_df

In [None]:
valid_dev_df = pd.read_csv(MOZILLA_PATH+"cv-valid-dev.csv")
valid_dev_df

In [None]:
valid_test_df = pd.read_csv(MOZILLA_PATH+"cv-valid-test.csv")
valid_test_df

In [None]:
def process_mozilla_df(df, path_type):
    """Normalize a Common Voice metadata frame in place.

    The frame is modified in place (nothing is returned):

    * rows without an ``accent`` value are dropped,
    * ``filename`` is renamed to ``file`` and ``accent`` to ``lang``,
    * an ``ID`` column is added holding the file's base name plus ``'.npy'``,
    * ``file`` is prefixed with ``path_type``.

    The function is idempotent: calling it again on an already processed frame
    leaves the frame unchanged, so the cell that applies it can be re-run.

    Args:
        df: DataFrame with ``filename`` and ``accent`` columns (or an already
            processed frame with ``file`` and ``lang`` columns).
        path_type: Directory prefix (ending in ``/``) prepended to each file path.
    """
    # After a first pass 'accent' has been renamed to 'lang' and the rows are
    # already filtered, so only drop when the raw column is still there.
    if 'accent' in df.columns:
        df.dropna(subset=['accent'], inplace=True)
    df.rename(columns={'filename': 'file', 'accent': 'lang'}, inplace=True)

    # Base name = text after the last '/', whatever the directory depth.
    df['ID'] = df['file'].str.rsplit('/', n=1).str[-1] + '.npy'

    # Prefix only the paths that do not carry the prefix yet (re-run safety).
    already_prefixed = df['file'].str.startswith(path_type)
    df['file'] = df['file'].where(already_prefixed, path_type + df['file'])

In [None]:
process_mozilla_df(valid_train_df, MOZILLA_VALID_TRAIN)
process_mozilla_df(valid_dev_df, MOZILLA_VALID_DEV)
process_mozilla_df(valid_test_df, MOZILLA_VALID_TEST)

In [None]:
valid_train_df

Unnamed: 0,file,text,up_votes,down_votes,age,gender,lang,duration,ID
5,/content/mozilla/cv-valid-train/cv-valid-train...,a shepherd may like to travel but he should ne...,1,0,twenties,female,us,,sample-000005.mp3.jpg
8,/content/mozilla/cv-valid-train/cv-valid-train...,put jackie right on the staff,3,0,seventies,male,us,,sample-000008.mp3.jpg
13,/content/mozilla/cv-valid-train/cv-valid-train...,but he had found a guide and didn't want to mi...,1,0,thirties,female,us,,sample-000013.mp3.jpg
14,/content/mozilla/cv-valid-train/cv-valid-train...,as they began to decorate the hallway a silhou...,1,0,sixties,male,england,,sample-000014.mp3.jpg
19,/content/mozilla/cv-valid-train/cv-valid-train...,then they got ahold of some dough and went goofy,1,0,fifties,male,australia,,sample-000019.mp3.jpg
...,...,...,...,...,...,...,...,...,...
195766,/content/mozilla/cv-valid-train/cv-valid-train...,but before i go i want to tell you a little story,1,0,fourties,male,england,,sample-195766.mp3.jpg
195767,/content/mozilla/cv-valid-train/cv-valid-train...,down below in the darkness were hundreds of pe...,3,0,thirties,female,us,,sample-195767.mp3.jpg
195770,/content/mozilla/cv-valid-train/cv-valid-train...,he heard a muffled grating sound and saw the b...,4,0,twenties,male,england,,sample-195770.mp3.jpg
195771,/content/mozilla/cv-valid-train/cv-valid-train...,the englishman said nothing,1,0,thirties,male,england,,sample-195771.mp3.jpg


In [None]:
valid_dev_df

Unnamed: 0,file,text,up_votes,down_votes,age,gender,lang,duration,ID
4,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,he moved about invisible but everyone could he...,1,0,fourties,female,england,,sample-000004.mp3.jpg
5,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,but everything had changed,3,0,teens,male,us,,sample-000005.mp3.jpg
8,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,the shop folks were taking down their shutters...,1,0,twenties,female,canada,,sample-000008.mp3.jpg
9,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,the teacher thought that he'd taught himself a...,1,0,fifties,female,australia,,sample-000009.mp3.jpg
13,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,in those days very few of the people had any i...,5,0,twenties,male,us,,sample-000013.mp3.jpg
...,...,...,...,...,...,...,...,...,...
4053,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,if you start your emails with greetings let me...,1,0,sixties,female,canada,,sample-004053.mp3.jpg
4055,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,the atmosphere was suffused with the sweet sce...,4,1,fourties,male,england,,sample-004055.mp3.jpg
4067,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,they reached the center of a large plaza where...,3,0,fourties,female,us,,sample-004067.mp3.jpg
4069,/content/mozilla/cv-valid-dev/cv-valid-dev/sam...,no sense messing up the streets,2,0,thirties,male,england,,sample-004069.mp3.jpg


In [None]:
valid_test_df

Unnamed: 0,file,text,up_votes,down_votes,age,gender,lang,duration,ID
3,/content/mozilla/cv-valid-test/cv-valid-test/s...,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,,sample-000003.mp3.jpg
5,/content/mozilla/cv-valid-test/cv-valid-test/s...,down below in the darkness were hundreds of pe...,4,1,twenties,male,us,,sample-000005.mp3.jpg
8,/content/mozilla/cv-valid-test/cv-valid-test/s...,this was the strangest of all things that ever...,1,0,thirties,male,england,,sample-000008.mp3.jpg
9,/content/mozilla/cv-valid-test/cv-valid-test/s...,it was glaringly hot not a cloud in the sky no...,3,0,fifties,male,us,,sample-000009.mp3.jpg
14,/content/mozilla/cv-valid-test/cv-valid-test/s...,follow the instructions here,1,0,twenties,male,scotland,,sample-000014.mp3.jpg
...,...,...,...,...,...,...,...,...,...
3971,/content/mozilla/cv-valid-test/cv-valid-test/s...,he's on my phone,1,0,teens,male,england,,sample-003971.mp3.jpg
3975,/content/mozilla/cv-valid-test/cv-valid-test/s...,the planet's core is made of solid gold but ev...,1,0,twenties,male,us,,sample-003975.mp3.jpg
3976,/content/mozilla/cv-valid-test/cv-valid-test/s...,we've got her located,1,0,thirties,male,england,,sample-003976.mp3.jpg
3980,/content/mozilla/cv-valid-test/cv-valid-test/s...,i've got to see nicole right away,1,0,fifties,male,australia,,sample-003980.mp3.jpg


## Managing dataset

### Load datasets

In [None]:
first_half_df = pd.read_csv(DRIVE_PATH + "first_half.csv")

In [None]:
first_half_df

Unnamed: 0,file,ID,lang
0,/content/voice/train/train/de_m_df90bffac026f7...,de_m_df90bffac026f705f4a99871d71684f1.fragment...,de
1,/content/voice/train/train/de_m_0345d503166fbc...,de_m_0345d503166fbcb125b6e10e2e804443.fragment...,de
2,/content/voice/train/train/en_m_d2f13c7f37ecec...,en_m_d2f13c7f37ecec67df0f46408c224bc1.fragment...,en
3,/content/voice/train/train/en_m_3a8f30bef6d7b6...,en_m_3a8f30bef6d7b6a1332629491eabcbf0.fragment...,en
4,/content/voice/train/train/en_f_7701e10392ce7b...,en_f_7701e10392ce7bcfba6d2fcca56fb8d9.fragment...,en
...,...,...,...
36535,/content/voice/train/train/de_f_26ddc4e756f612...,de_f_26ddc4e756f61233d74d42cc000c9dae.fragment...,de
36536,/content/voice/train/train/en_m_17b5d499519197...,en_m_17b5d4995191976232356345dfbb82cb.fragment...,en
36537,/content/voice/train/train/es_f_56ef0bde6c92a0...,es_f_56ef0bde6c92a0100da23cdd39a8a112.fragment...,es
36538,/content/voice/train/train/es_f_cc86b9208b120f...,es_f_cc86b9208b120f1f147a9246ab39f5e4.fragment...,es


In [None]:
print(len(first_half_df[first_half_df['lang']  == 'en']))
print(len(first_half_df[first_half_df['lang']  == 'es']))
print(len(first_half_df[first_half_df['lang']  == 'de']))

12163
12200
12177


In [None]:
print(first_half_df["ID"].iloc[1])

de_m_0345d503166fbcb125b6e10e2e804443.fragment25.speed4.npy


In [None]:
%%time
format_len = len('flac')
np.vectorize(spectrogram_mat)(TRAIN_VOICE_PATH, first_half_df['ID'], TRAIN_MAT_PATH)

CPU times: user 36min 28s, sys: 26min 53s, total: 1h 3min 22s
Wall time: 41min


array([None, None, None, ..., None, None, None], dtype=object)

In [None]:
! du -sh $TRAIN_MAT_PATH

7.6G	/content/mat/train/


In [None]:
! rm -r $VOICE_DIR

In [None]:
! mkdir {DRIVE3_PATH}first_half

In [None]:
first_half_df.apply(lambda x: shutil.copy(TRAIN_MAT_PATH+x.ID, DRIVE3_PATH+"first_half/"+x.ID), axis=1)

0        /content/drive/MyDrive/Bachelor_Project_Part3/...
1        /content/drive/MyDrive/Bachelor_Project_Part3/...
2        /content/drive/MyDrive/Bachelor_Project_Part3/...
3        /content/drive/MyDrive/Bachelor_Project_Part3/...
4        /content/drive/MyDrive/Bachelor_Project_Part3/...
                               ...                        
36535    /content/drive/MyDrive/Bachelor_Project_Part3/...
36536    /content/drive/MyDrive/Bachelor_Project_Part3/...
36537    /content/drive/MyDrive/Bachelor_Project_Part3/...
36538    /content/drive/MyDrive/Bachelor_Project_Part3/...
36539    /content/drive/MyDrive/Bachelor_Project_Part3/...
Length: 36540, dtype: object

In [None]:
first_half_df['lang'] = first_half_df['ID'].apply(lambda x: x.split('_')[0])

In [None]:
first_half_df.to_csv(DRIVE_PATH+"first_half.csv")

In [None]:
testdf

Unnamed: 0,file,ID
0,/content/voice/test/test/en_f_67a0cba10d171b24...,en_f_67a0cba10d171b24039a79faa1d4d603.fragment...
1,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
2,/content/voice/test/test/es_f_50298ab71aaba850...,es_f_50298ab71aaba8508ebeef49d853df11.fragment...
3,/content/voice/test/test/de_f_63f5b79c76cf5a1a...,de_f_63f5b79c76cf5a1a4bbd1c40f54b166e.fragment...
4,/content/voice/test/test/en_f_67a0cba10d171b24...,en_f_67a0cba10d171b24039a79faa1d4d603.fragment...
...,...,...
535,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...
536,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
537,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
538,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...


In [None]:
testdf['lang'] = testdf['ID'].apply(lambda x: x.split('_')[0])

In [None]:
testdf.to_csv(DRIVE_PATH+"testdf.csv")

In [None]:
%%time
format_len = len('flac')
testdf = pd.DataFrame(test_data_dir,columns = ['file'])
testdf['ID']= testdf['file'].apply(lambda x: x.split('/')[-1][:-format_len]+'npy')
np.vectorize(spectrogram_mat)(TEST_VOICE_PATH, testdf['ID'], TEST_MAT_PATH)
testdf

### Move to label directory

In [None]:
labels = ['en','es','de']
for x in labels:
    !mkdir $TRAIN_IMG_PATH/$x
    !mkdir $TEST_IMG_PATH/$x

In [None]:
def move_to_label(name, lang, is_train=True):
    """Move one file into the sub-folder named after its language label.

    Args:
        name: File name, relative to the image folder.
        lang: Label; also the name of the destination sub-folder.
        is_train: Use TRAIN_IMG_PATH when true, TEST_IMG_PATH otherwise.
    """
    base_dir = TRAIN_IMG_PATH
    if not is_train:
        base_dir = TEST_IMG_PATH
    source = base_dir + name
    destination = base_dir + lang + "/" + name
    shutil.move(source, destination)

In [None]:
train_data_dir=np.array(glob(TRAIN_VOICE_PATH+"*"))
test_data_dir=np.array(glob(TEST_VOICE_PATH+"*"))

In [None]:
train_data_dir

array(['/content/voice/train/train/de_m_df90bffac026f705f4a99871d71684f1.fragment28.noise3.flac',
       '/content/voice/train/train/en_f_9a47e3d56398b8108f59385aab8bbe87.fragment21.pitch5.flac',
       '/content/voice/train/train/de_f_2825fa225d6ca4800f0cf0504b76ca65.fragment14.speed6.flac',
       ...,
       '/content/voice/train/train/es_f_68e1dcf1b41edcfa24b402f46ec2a65e.fragment2.speed7.flac',
       '/content/voice/train/train/de_m_3aa01c55ba7ca85cad2977c848eb826f.fragment8.noise6.flac',
       '/content/voice/train/train/de_m_fc6bd6bb9d66a89bb8d8a8a7efa23e6b.fragment9.speed8.flac'],
      dtype='<U88')

In [None]:
print(len(train_data_dir))

73080


In [None]:
print(len(test_data_dir))

540


## Create spectrogram

In [None]:
def spectrogram_mat(address_prefix, file, save_prefix, format='flac'):
    """Compute the dB-scaled mel spectrogram of one audio clip and save it with ``np.save``.

    Args:
        address_prefix: Folder (with trailing separator) holding the audio file.
        file: Target file name; its last three characters are swapped for
            ``format`` to obtain the audio file name.
        save_prefix: Folder (with trailing separator) the array is saved into,
            under the name ``file``.
        format: Extension of the audio file, without the dot.
    """
    audio_path = address_prefix + file[:-3] + format
    # sr=None: load at the file's own sampling rate
    samples, native_rate = librosa.load(audio_path, sr=None)
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=samples, sr=native_rate),
        ref=np.max,
    )
    np.save(save_prefix + file, mel_db)

In [None]:
def show_spectrogram_mat(filename, file, is_train=True):
    """Render a saved spectrogram array to a ``.jpg`` image file.

    Nothing is displayed: the figure is written to disk and then closed.

    Args:
        filename: Path of the ``.npy`` array to load.
        file: File name used for the output; its last four characters are
            replaced by ``.jpg``.
        is_train: Write under TRAIN_IMG_PATH when true, TEST_IMG_PATH otherwise.
    """
    # switch pyplot to non-interactive mode (global pyplot setting)
    plt.interactive(False)
    fig = plt.figure(figsize=[0.72,0.72])
    ax = fig.add_subplot(111)
    # hide both axes and the frame so only the spectrogram itself is drawn
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    # the array is plotted as stored; no dB conversion is applied here
    S= np.load(filename)
    librosa.display.specshow(S)
    prefix= TRAIN_IMG_PATH
    if not is_train:
        prefix= TEST_IMG_PATH
    name  = prefix + file[:-4] + ".jpg"
    plt.savefig(name, dpi=400, bbox_inches='tight',pad_inches=0)
    # release the figure; NOTE(review): the four calls below overlap — presumably
    # added to limit memory growth when called in a long loop
    plt.close()
    fig.clf()
    plt.close(fig)
    plt.close('all')

### generate spectrogram for dataframe

In [None]:
%%time
format_len = len('flac')
traindf = pd.DataFrame(train_data_dir,columns = ['file'])
traindf['ID']= traindf['file'].apply(lambda x: x.split('/')[-1][:-format_len]+'npy')
np.vectorize(spectrogram_mat)(TRAIN_VOICE_PATH, traindf['ID'], TRAIN_MAT_PATH)
traindf

CPU times: user 47min 31s, sys: 32min 12s, total: 1h 19min 43s
Wall time: 49min 5s


Unnamed: 0,file,ID
0,/content/voice/train/train/de_m_df90bffac026f7...,de_m_df90bffac026f705f4a99871d71684f1.fragment...
1,/content/voice/train/train/en_f_9a47e3d56398b8...,en_f_9a47e3d56398b8108f59385aab8bbe87.fragment...
2,/content/voice/train/train/de_f_2825fa225d6ca4...,de_f_2825fa225d6ca4800f0cf0504b76ca65.fragment...
3,/content/voice/train/train/en_m_81995ee8a5e990...,en_m_81995ee8a5e990193b7858ec4b158e48.fragment...
4,/content/voice/train/train/en_m_1e512792ebab2b...,en_m_1e512792ebab2bd93b7aacac3d521390.fragment...
...,...,...
73075,/content/voice/train/train/de_f_8e8ed685b0cf9f...,de_f_8e8ed685b0cf9f5bbd13a04629ee64c5.fragment...
73076,/content/voice/train/train/es_f_47bd2e6178465c...,es_f_47bd2e6178465cd745c86c9db5ffe447.fragment...
73077,/content/voice/train/train/es_f_68e1dcf1b41edc...,es_f_68e1dcf1b41edcfa24b402f46ec2a65e.fragment...
73078,/content/voice/train/train/de_m_3aa01c55ba7ca8...,de_m_3aa01c55ba7ca85cad2977c848eb826f.fragment...


In [None]:
%%time
format_len = len('flac')
testdf = pd.DataFrame(test_data_dir,columns = ['file'])
testdf['ID']= testdf['file'].apply(lambda x: x.split('/')[-1][:-format_len]+'npy')
np.vectorize(spectrogram_mat)(TEST_VOICE_PATH, testdf['ID'], TEST_MAT_PATH)
testdf

CPU times: user 29.8 s, sys: 21.6 s, total: 51.3 s
Wall time: 32.7 s


Unnamed: 0,file,ID
0,/content/voice/test/test/en_f_67a0cba10d171b24...,en_f_67a0cba10d171b24039a79faa1d4d603.fragment...
1,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
2,/content/voice/test/test/es_f_50298ab71aaba850...,es_f_50298ab71aaba8508ebeef49d853df11.fragment...
3,/content/voice/test/test/de_f_63f5b79c76cf5a1a...,de_f_63f5b79c76cf5a1a4bbd1c40f54b166e.fragment...
4,/content/voice/test/test/en_f_67a0cba10d171b24...,en_f_67a0cba10d171b24039a79faa1d4d603.fragment...
...,...,...
535,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...
536,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
537,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...
538,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...


In [None]:
! ls $TRAIN_MAT_PATH -1 | wc -l

73080


In [None]:
! du -sh $TRAIN_MAT_PATH

16G	/content/mat/train/


In [None]:
! mkdir {DRIVE3_PATH}"test/"

In [None]:
show_spectrogram_mat('/content/train/train-img/de_f_26ddc4e756f61233d74d42cc000c9dae.fragment23.pitch8.flac2.npy','de_f_26ddc4e756f61233d74d42cc000c9dae.fragment23.pitch8.flac2.npy')

In [None]:
traindf['lang'] = traindf['ID'].apply(lambda x: x.split('_')[0])

In [None]:
move_to_label(traindf['ID'][0], traindf['lang'][0])

In [None]:
np.vectorize(move_to_label, cache=True)(traindf['ID'], traindf['lang'])

### read data from drive

In [None]:
### for first_half
first_half_df=pd.read_csv(DRIVE_PATH+'first_half.csv')
traindf = first_half_df

In [None]:
first_half_df

Unnamed: 0,file,ID,lang
0,/content/voice/train/train/de_m_df90bffac026f7...,de_m_df90bffac026f705f4a99871d71684f1.fragment...,de
1,/content/voice/train/train/de_m_0345d503166fbc...,de_m_0345d503166fbcb125b6e10e2e804443.fragment...,de
2,/content/voice/train/train/en_m_d2f13c7f37ecec...,en_m_d2f13c7f37ecec67df0f46408c224bc1.fragment...,en
3,/content/voice/train/train/en_m_3a8f30bef6d7b6...,en_m_3a8f30bef6d7b6a1332629491eabcbf0.fragment...,en
4,/content/voice/train/train/en_f_7701e10392ce7b...,en_f_7701e10392ce7bcfba6d2fcca56fb8d9.fragment...,en
...,...,...,...
36535,/content/voice/train/train/de_f_26ddc4e756f612...,de_f_26ddc4e756f61233d74d42cc000c9dae.fragment...,de
36536,/content/voice/train/train/en_m_17b5d499519197...,en_m_17b5d4995191976232356345dfbb82cb.fragment...,en
36537,/content/voice/train/train/es_f_56ef0bde6c92a0...,es_f_56ef0bde6c92a0100da23cdd39a8a112.fragment...,es
36538,/content/voice/train/train/es_f_cc86b9208b120f...,es_f_cc86b9208b120f1f147a9246ab39f5e4.fragment...,es


### generate spectrogram

In [None]:
# NOTE(review): this redefines spectrogram_mat, which is already defined with the same
# body in the "Create spectrogram" section; the later definition is the one in effect.
def spectrogram_mat(address_prefix, file, save_prefix, format='flac'):
    """Compute the dB-scaled mel spectrogram of one audio clip and save it with ``np.save``.

    Args:
        address_prefix: Folder (with trailing separator) holding the audio file.
        file: Target file name; its last three characters are replaced by
            ``format`` to obtain the audio file name.
        save_prefix: Folder (with trailing separator) the array is saved into,
            under the name ``file``.
        format: Extension of the audio file, without the dot.
    """
    # sr=None: load at the file's own sampling rate
    clip, sample_rate = librosa.load(address_prefix+file[:-3]+format, sr=None)
    S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)  # alternative tried: n_mels=40, hop_length=221
    # convert power to dB, using the array's maximum as the reference
    SP = librosa.power_to_db(S, ref=np.max)
    np.save(save_prefix+file, SP)

In [None]:
%%time
format_len = len('flac')
np.vectorize(spectrogram_mat)(TRAIN_VOICE_PATH, traindf['ID'], TRAIN_MAT_PATH)
traindf

CPU times: user 1h 1min, sys: 1h 55min 49s, total: 2h 56min 50s
Wall time: 50min 14s


Unnamed: 0,file,ID,lang
0,/content/voice/train/train/de_f_404792fda5c66b...,de_f_404792fda5c66b95903adafd5a5f5d10.fragment...,de
1,/content/voice/train/train/es_m_534a0cf6f6aea8...,es_m_534a0cf6f6aea8e35667a9147d2d0f8d.fragment...,es
2,/content/voice/train/train/es_m_406bb0dc71a220...,es_m_406bb0dc71a22051d900bc55cdfad59e.fragment...,es
3,/content/voice/train/train/de_m_8ea3514af1c014...,de_m_8ea3514af1c01478fbcd004b026fea71.fragment...,de
4,/content/voice/train/train/en_m_17b5d499519197...,en_m_17b5d4995191976232356345dfbb82cb.fragment...,en
...,...,...,...
73075,/content/voice/train/train/de_m_502293f2297515...,de_m_502293f229751563d1768dd2367a6b29.fragment...,de
73076,/content/voice/train/train/de_f_9bd3b58fe6f75d...,de_f_9bd3b58fe6f75dc40e85021914b4b06a.fragment...,de
73077,/content/voice/train/train/de_f_9bd3b58fe6f75d...,de_f_9bd3b58fe6f75dc40e85021914b4b06a.fragment...,de
73078,/content/voice/train/train/de_m_1b27ef4569a51e...,de_m_1b27ef4569a51e82d6b2dfae8d565d25.fragment...,de


In [None]:
%%time
format_len = len('flac')
np.vectorize(spectrogram_mat)(TEST_VOICE_PATH, testdf['ID'], TEST_MAT_PATH)
testdf

CPU times: user 27.1 s, sys: 48.1 s, total: 1min 15s
Wall time: 21.8 s


Unnamed: 0,file,ID,lang
0,/content/voice/test/test/de_f_63f5b79c76cf5a1a...,de_f_63f5b79c76cf5a1a4bbd1c40f54b166e.fragment...,de
1,/content/voice/test/test/es_m_08111ce8d6a7ebc6...,es_m_08111ce8d6a7ebc6cd2c27e62a3d98f0.fragment...,es
2,/content/voice/test/test/es_f_50298ab71aaba850...,es_f_50298ab71aaba8508ebeef49d853df11.fragment...,es
3,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...,en
4,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...,de
...,...,...,...
535,/content/voice/test/test/en_f_67a0cba10d171b24...,en_f_67a0cba10d171b24039a79faa1d4d603.fragment...,en
536,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...,de
537,/content/voice/test/test/en_m_b74b2bf2af570393...,en_m_b74b2bf2af570393cae91f4ed89cece7.fragment...,en
538,/content/voice/test/test/de_m_923551d571cc4373...,de_m_923551d571cc437382d0294dda2dd0aa.fragment...,de


In [None]:
print(traindf.iloc[0].ID)

es_f_56ef0bde6c92a0100da23cdd39a8a112.fragment13.speed3.npy


In [None]:
gc.collect()

100189

In [None]:
!rm {CUR_DIR}spoken-language-identification.zip

In [None]:
!rm $VOICE_DIR -r

In [None]:
traindf=pd.read_csv(DRIVE_PATH+'traindf.csv')

In [None]:
! ls $TRAIN_MAT_PATH/first_half -1 | wc -l

ls: cannot access '/content/mat/train//first_half': No such file or directory
0


In [None]:
! ls {DRIVE3_PATH}first_half -1 | wc -l

36540


In [None]:
# Load every array named in traindf['ID'] into memory and stack them into one array.
# NOTE(review): the IDs look like bare file names (no directory prefix), so np.load
# resolves them against the current working directory — confirm the kernel is in the
# folder that holds the .npy files before running this cell.
file_paths = traindf['ID'].values
lang = np.array(traindf['lang'].values)

data = [np.load(file_path) for file_path in file_paths]
# np.stack requires all loaded arrays to have the same shape
data = np.stack(data)

In [None]:
%cd $CUR_DIR

In [None]:
cur_shape = data.shape
data = data.reshape((cur_shape[0], cur_shape[1], cur_shape[2], 1))
data.shape

(10000, 128, 431, 1)

In [None]:
! ls {DRIVE_PATH}/train -1 | wc -l

10000


In [None]:
data[0].shape

(128, 431, 1)

## Data Generator

In [None]:
from tensorflow import keras
import numpy as np
import os

class NumpyDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of pre-computed ``.npy`` arrays from disk.

    Item ``idx`` is a tuple ``(batch_data, batch_labels)``: ``batch_data`` has shape
    ``(batch, *sample_shape)`` and ``batch_labels`` is the matching slice of ``labels``.
    The last batch may be smaller than ``batch_size``; it is included, so every
    sample is served exactly once per epoch.

    Args:
        data_folder: Directory containing the ``.npy`` files.
        ids: File names (relative to ``data_folder``), one per sample.
        labels: Labels aligned with ``ids`` (same length).
        batch_size: Number of samples per batch; must be positive.
        shuffle: Shuffle once at construction and again after every epoch.
        seed: Seed of the generator's private random stream. With ``None`` the
            global NumPy random state is used.
        rescale: Stored on the instance; it is not applied to the data.
        subset: Free-form tag stored on the instance; it does not affect batching.
        sample_shape: Shape of the array stored in each file.

    Raises:
        ValueError: If ``ids`` and ``labels`` differ in length or ``batch_size``
            is not positive.
    """

    def __init__(self, data_folder, ids, labels, batch_size, shuffle=True, seed=None,
                 rescale=None, subset='training', sample_shape=(128, 431)):
        super().__init__()
        self.data_folder = data_folder
        # Arrays are required for the fancy indexing done when shuffling.
        self.file_list = np.asarray(ids)
        self.file_labels = np.asarray(labels)
        if len(self.file_list) != len(self.file_labels):
            raise ValueError("ids and labels must have the same length")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.rescale = rescale
        self.subset = subset
        self.sample_shape = tuple(sample_shape)

        # Private random stream: seeded once, so successive epochs get different
        # permutations and the global NumPy state is never re-seeded behind the
        # caller's back. The first permutation matches a global reseed with `seed`.
        self._rng = np.random.RandomState(seed) if seed is not None else np.random

        if self.shuffle:
            self._shuffle_data()

    def _shuffle_data(self):
        """Apply one random permutation to files and labels, keeping them aligned."""
        indices = self._rng.permutation(len(self.file_list))
        self.file_list = self.file_list[indices]
        self.file_labels = self.file_labels[indices]

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        # Ceiling division: flooring here would silently drop the trailing samples.
        return (len(self.file_list) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Load and return batch ``idx`` as ``(batch_data, batch_labels)``."""
        start = idx * self.batch_size
        stop = min(start + self.batch_size, len(self.file_list))
        batch_files = self.file_list[start:stop]
        batch_labels = self.file_labels[start:stop]

        batch_data = np.empty((len(batch_files), *self.sample_shape))
        for i, filename in enumerate(batch_files):
            batch_data[i] = np.load(os.path.join(self.data_folder, filename))

        return batch_data, batch_labels

    def on_epoch_end(self, logs=None):
        """Reshuffle between epochs when shuffling is enabled."""
        if self.shuffle:
            self._shuffle_data()


In [None]:
X_trainid = np.array(traindf['ID'].values)
y_train = np.array(traindf['lang'].values)

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Map the language codes to integer classes, then to one-hot rows.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
# Take the number of classes from the fitted encoder instead of hard-coding it.
y_train = to_categorical(y_train, len(label_encoder.classes_))

In [None]:
traindf

In [None]:
np.random.seed(42)
shuffle_indices = np.random.permutation(len(X_trainid))
X_trainid = X_trainid[shuffle_indices]
y_train = y_train[shuffle_indices]

In [None]:
import copy
def split_data(X_id, y_label, validation_split):
    """Split ids and labels into a training part and a validation part.

    The first ``int(validation_split * len(X_id))`` items form the validation
    part and the remainder forms the training part. No shuffling is done here.

    Args:
        X_id: Sliceable sequence of sample ids.
        y_label: Sliceable sequence of labels aligned with ``X_id``.
        validation_split: Fraction of the items assigned to validation.

    Returns:
        Tuple ``(X_train_id, y_train_label, X_valid_id, y_valid_label)`` of
        deep copies, independent of the inputs.
    """
    boundary = int(validation_split * len(X_id))
    train_part = slice(boundary, None)
    valid_part = slice(None, boundary)
    return tuple(
        copy.deepcopy(sequence[part])
        for part in (train_part, valid_part)
        for sequence in (X_id, y_label)
    )

In [None]:
X_train_id, y_train_label, X_valid_id, y_valid_label = split_data(X_trainid, y_train, 0.2)

In [None]:
print((y_valid_label[:,0] == 1.0).sum())
print((y_valid_label[:,1] == 1.0).sum())
print((y_valid_label[:,2] == 1.0).sum())
print((y_train_label[:,0] == 1.0).sum())
print((y_train_label[:,1] == 1.0).sum())
print((y_train_label[:,2] == 1.0).sum())

4835
4849
4932
19525
19511
19428


In [None]:
print(len(X_train_id), len(y_train_label), len(X_valid_id), len(y_valid_label))

58464 58464 14616 14616


In [None]:
print(X_trainid)

In [None]:
print(y_train)

In [None]:
print(X_trainid[3], y_train[3])

en_m_81995ee8a5e990193b7858ec4b158e48.fragment5.pitch5.npy [0. 1. 0.]


In [None]:
print(str(y_train[0]))

In [None]:
data_folder = TRAIN_MAT_PATH #########
batch_size = 32
validation_split = 0.2
rescale_factor = 1.0 / 255.0

params = {
    'data_folder': data_folder,
    ##'ids' : X_trainid,
    ##'labels': y_train,
    'batch_size': batch_size,
    #'validation_split': validation_split,
    'shuffle': True, #############################
    'seed': 42,
    'rescale': rescale_factor,
}

train_generator = NumpyDataGenerator(subset='training', ids= X_train_id, labels= y_train_label, **params)
valid_generator = NumpyDataGenerator(subset='validation', ids= X_valid_id, labels= y_valid_label, **params)

### prepare test data

In [None]:
X_testid = np.array(testdf['ID'].values)
y_test = np.array(testdf['lang'].values)

# Reuse the label_encoder fitted on the training labels (transform only, no refit),
# so the test classes map to the same integers as in training.
y_test = label_encoder.transform(y_test)
# Take the number of classes from the fitted encoder instead of hard-coding it.
y_test = to_categorical(y_test, len(label_encoder.classes_))

In [None]:
test_data_folder = TEST_MAT_PATH #########
batch_size = 32
#validation_split = 0.2
rescale_factor = 1.0 / 255.0

test_params = {
    'data_folder': test_data_folder,
    ##'ids' : X_testid,
    ##'labels': y_test,
    'batch_size': batch_size,
    #'validation_split': validation_split,
    'shuffle': False, #############################
    'seed': 42,
    'rescale': rescale_factor,
}

test_generator = NumpyDataGenerator(subset='test', ids= X_testid, labels= y_test, **test_params)

### CNN without dropout

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Conv2D, MaxPooling2D
#from keras import regularizers, optimizers
import keras

model_without_dropout = Sequential()

# Five convolutional blocks, each Conv2D -> BatchNormalization -> MaxPooling2D.
# One (filters, kernel_size, padding) entry per block, in order.
conv_blocks = [
    (32, (7, 7), 'valid'),
    (64, (5, 5), 'same'),
    (128, (3, 3), 'same'),
    (256, (3, 3), 'same'),
    (512, (3, 3), 'same'),
]
for block_index, (filters, kernel_size, conv_padding) in enumerate(conv_blocks):
    # only the very first layer declares the input shape
    first_layer_kwargs = {'input_shape': (128, 431, 1)} if block_index == 0 else {}
    model_without_dropout.add(
        Conv2D(filters, kernel_size, padding=conv_padding, activation='relu', **first_layer_kwargs)
    )
    model_without_dropout.add(BatchNormalization())
    model_without_dropout.add(MaxPooling2D(pool_size=(3, 3), strides=2, padding='same'))

# Classifier head. Despite the model's name, one Dropout(0.5) layer sits in front of
# the output layer; "without dropout" refers to the convolutional blocks.
head_layers = (
    Flatten(),
    BatchNormalization(),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(3, activation='softmax'),
)
for head_layer in head_layers:
    model_without_dropout.add(head_layer)

In [None]:
model_without_dropout.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001, decay=1e-6),
              loss=keras.losses.CategoricalCrossentropy(),
              metrics=["accuracy"])
model_without_dropout.summary()

In [None]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(DRIVE_PATH+"models/"+"paperCNN_withoutdropout_auto_after20.hdf5", #.{epoch:02d}-{val_loss:.4f}
                             save_weights_only=False,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             mode='auto',
                             save_freq="epoch")

In [None]:
model_without_dropout.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=20,
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
    callbacks=[checkpoint],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7a1840cb1ff0>

In [None]:
model_without_dropout.save(DRIVE_PATH+"models/"+'without_dropout_20epoch_day2.h5')

In [None]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(DRIVE_PATH+"models/"+"paper_withoutdropout_auto_havedone20_{epoch:02d}_{val_loss:.4f}.hdf5",
                             save_weights_only=False,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             mode='auto',
                             save_freq="epoch")

In [None]:
model_without_dropout.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=30,
    initial_epoch=20,
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
    callbacks=[checkpoint],
)

Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7a1840e978e0>

In [None]:
model_without_dropout.save(DRIVE_PATH+"models/"+'without_dropout_30epoch_day2.h5')

In [None]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(DRIVE_PATH+"models/"+"paper_withoutdropout_auto_havedone30_{epoch:02d}_{val_loss:.4f}.hdf5",
                             save_weights_only=False,
                             monitor='val_accuracy',
                             verbose=0,
                             save_best_only=True,
                             mode='auto',
                             save_freq="epoch")

In [None]:
model_without_dropout.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=40,
    initial_epoch=30,
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
    callbacks=[checkpoint],
)

Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7a1841571a50>

In [None]:
model_without_dropout.save(DRIVE_PATH+"models/"+'without_dropout_40epoch_day2.h5')

In [None]:
model_without_dropout.evaluate(test_generator, steps= len(test_generator))



[0.7510324120521545, 0.962890625]