## Data Preprocessing

In [1]:
# loading libraries
import librosa   #for audio processing
import librosa.display
import wave
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns
import sys
import json
from scipy.io import wavfile #for audio processing
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Make the project's helper modules importable, then build the helpers
# used throughout the rest of the notebook.
sys.path.insert(1, '../scripts')
sys.path.extend(["..", "."])

from meta_cleaner import MetaCleaner
from audio_cleaner import AudioCleaner
from data_viz import Data_Viz

# Both cleaners are given the same notebook log file.
notebook_log = "../logs/preprocessing_notebook.log"
MC = MetaCleaner(notebook_log)
AC = AudioCleaner(notebook_log)
DV = Data_Viz()

## Processing Single Audio File

In [3]:
# Load one training clip and keep it as a [samples, rate] pair,
# which is the form passed to the AudioCleaner calls below.
sam, rat = AC.load_audio("../data/train/wav/tr_101_tr02001.wav")
audio = [sam, rat]

print("Sample: ", sam)
print("Rate: ", rat)

Sample:  [-0.00567627 -0.00564575 -0.00564575 ... -0.0057373  -0.0057373
 -0.00558472]
Rate:  16000


In [4]:
# Convert the clip to stereo and compare the sample-array shape
# before and after the conversion.
print("Before: ", audio[0].shape)

output = AC.change_to_stereo(audio)

print("After: ", output[0].shape)

Before:  (126976,)
After:  (2, 126976)


In [5]:
# Change the sampling rate to 44100 and report the rate stored in the
# second slot of the pair before and after the call.
print("Before: ", output[1])

output2 = AC.change_rate(output, 44100)

print("After: ", output2[1])
    

Before:  16000
After:  44100


In [6]:
# Bring the clip to a fixed duration and compare lengths (in seconds).
# NOTE(review): 8100 is presumably milliseconds -- confirm against AudioCleaner.change_duration.
print("Before: ", librosa.get_duration(y=output2[0], sr=output2[1]))

output3 = AC.change_duration(output2, 8100)

print("After: ", librosa.get_duration(y=output3[0], sr=output3[1]))
    

Before:  7.9360090702947845
After:  8.081632653061224


In [7]:
# Augmentation: apply AudioCleaner.time_shift to the fixed-length clip.
output4 = AC.time_shift(
    output3
)

## Processing in batch

### Metadata preparation

In [8]:
# Build the metadata table from the raw training directory (`path`) and
# the directory where processed audio will be written (`output`).
path, output = "../data/train", "../data/train_new"

meta_data = MC.generate_metadata(path, output)

# Preview the first rows.
meta_data.head()

Unnamed: 0,Target,Feature,Output
0,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,../data/train/wav/tr_1_tr01001.wav,../data/train_new/tr_1_tr01001.wav
1,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስፖ...,../data/train/wav/tr_2_tr01002.wav,../data/train_new/tr_2_tr01002.wav
2,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ ...,../data/train/wav/tr_3_tr01003.wav,../data/train_new/tr_3_tr01003.wav
3,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,../data/train/wav/tr_4_tr01004.wav,../data/train_new/tr_4_tr01004.wav
4,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳጀ ች,../data/train/wav/tr_5_tr01005.wav,../data/train_new/tr_5_tr01005.wav


In [9]:
# Add a "Duration" column to the metadata table; the return value is
# not used here, and the "Duration" column is read from meta_data below.
MC.add_duration(meta_data)

# Keep rows whose duration differs from 400.
# NOTE(review): 400 looks like a sentinel value -- confirm its meaning in MetaCleaner.
selection = meta_data.loc[meta_data["Duration"] != 400]
selection.head()

Unnamed: 0,Target,Feature,Output,Duration
9,ግን ወደ ኋላው ላይ ኢሳያስ እንደ ልማ ዳቸው ሁሉን ም የ መልከ ፍ ዲፕሎ...,../data/train/wav/tr_10_tr01010.wav,../data/train_new/tr_10_tr01010.wav,10.368
99,አለቃ የጻፏቸው መጽሀፍት ውድ ና ጣፋጭ ከ መሆናቸው የተነሳ በ ህትመታቸው...,../data/train/wav/tr_100_tr01100.wav,../data/train_new/tr_100_tr01100.wav,14.592
100,በ ኮምፒውተር ሳይንስ ፎን ት ቴክኖሎጂ ለ ዶክትሬት ዲግሪ ጥናት እያደረጉ...,../data/train/wav/tr_101_tr02001.wav,../data/train_new/tr_101_tr02001.wav,7.936
101,የ ውሀው ዘርፍ ያለበት ን የ ፋይናንስ ችግር ለ መፍታት የ ውሀ ሀብት ል...,../data/train/wav/tr_102_tr02002.wav,../data/train_new/tr_102_tr02002.wav,6.528
102,የ መንገደኞች ማስተናገጃ ህንጻው በ ሰአት እስከ ሶስት ሺ ያህል መንገደኞ...,../data/train/wav/tr_103_tr02003.wav,../data/train_new/tr_103_tr02003.wav,6.528


In [10]:
# Build the audio-processing pipeline and run it over every row of the
# metadata table. The list holds the same values used in the
# single-file walkthrough above (44100 for the rate, 8100 for the duration).
pipe_params = [44100, 8100]
pipe = AC.build_pipe(pipe_params)

AC.batch_iterator(meta_data, pipe)

Pipeline Ready!

Audio Processing Started...


## Final check

In [None]:
# Sanity-check one processed file and refresh the metadata columns.
# The WAV file is opened in a context manager so the underlying file
# handle is closed as soon as its header parameters have been printed
# (the original left the handle open for the life of the kernel).
with wave.open('../data/train_new/tr_10_tr01010.wav') as data:
    print("the parameters are: ", data.getparams())

# Recompute duration and channel count with output=True and rebind
# meta_data to the returned tables.
meta_data = MC.add_duration(meta_data, output=True)
meta_data = MC.channel_count(meta_data, output=True)

# Keep rows whose duration differs from 400.
# NOTE(review): 400 looks like a sentinel value -- confirm its meaning in MetaCleaner.
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

In [None]:
# Reload one processed clip at its native sampling rate (sr=None) and
# hand it to the Data_Viz plot_spec helper.
samples, sample_rate = librosa.load(
    '../data/train_new/tr_10_tr01010.wav',
    sr=None,
)
DV.plot_spec(data=samples, sr=sample_rate)

## Data Splitting

In [None]:
# Shuffle and split the metadata; the first two returned frames are
# used as the train and test sets.
# NOTE(review): 80 is presumably the train percentage and 2 the number of parts -- confirm in MetaCleaner.split.
dfs = MC.split(meta_data, 80, 2)
train, test = dfs[0], dfs[1]

print(train.shape)
print(test.shape)

### Saving processed metadata

In [None]:
# Persist the train and test metadata tables as CSV files.
for frame, csv_path in (
    (train, "../data/train_meta.csv"),
    (test, "../data/test_meta.csv"),
):
    MC.meta_saver(frame, csv_path, "csv")

In [None]:
# Reload a saved CSV and preview it to confirm the file was written.
# `DC` is never defined in this notebook, so the original call raised
# NameError; the MetaCleaner instance `MC` is used instead.
# NOTE(review): assumes meta_loader is defined on MetaCleaner alongside meta_saver -- confirm.
test_df = MC.meta_loader("../data/train_meta.csv", "csv")
test_df.head()