## Data Preprocessing

In [2]:
# loading libraries
import librosa   #for audio processing
import librosa.display
import wave
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import sys
from scipy.io import wavfile #for audio processing
import warnings
warnings.filterwarnings("ignore")

In [3]:
# loading scripts
sys.path.insert(1, '../scripts')
sys.path.append("..")
sys.path.append(".")

from data_cleaning import DataCleaner
from data_viz import Data_Viz

DC = DataCleaner("../logs/preprocessing_notebook.log")
DV = Data_Viz()

## Load metadata

In [4]:
# loading meta data
path = "../data/train"
output = "../data/train_new"

meta_data = DC.generate_metadata(path, output)
meta_data.head()


Unnamed: 0,Target,Feature,Output
0,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,../data/train/wav/tr_1_tr01001.wav,../data/train_new/tr_1_tr01001.wav
1,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስፖ...,../data/train/wav/tr_2_tr01002.wav,../data/train_new/tr_2_tr01002.wav
2,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ ...,../data/train/wav/tr_3_tr01003.wav,../data/train_new/tr_3_tr01003.wav
3,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,../data/train/wav/tr_4_tr01004.wav,../data/train_new/tr_4_tr01004.wav
4,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳጀ ች,../data/train/wav/tr_5_tr01005.wav,../data/train_new/tr_5_tr01005.wav


In [5]:
# adding duration column

DC.add_duration(meta_data)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

Unnamed: 0,Target,Feature,Output,Duration
9,ግን ወደ ኋላው ላይ ኢሳያስ እንደ ልማ ዳቸው ሁሉን ም የ መልከ ፍ ዲፕሎ...,../data/train/wav/tr_10_tr01010.wav,../data/train_new/tr_10_tr01010.wav,10.368
99,አለቃ የጻፏቸው መጽሀፍት ውድ ና ጣፋጭ ከ መሆናቸው የተነሳ በ ህትመታቸው...,../data/train/wav/tr_100_tr01100.wav,../data/train_new/tr_100_tr01100.wav,14.592
100,በ ኮምፒውተር ሳይንስ ፎን ት ቴክኖሎጂ ለ ዶክትሬት ዲግሪ ጥናት እያደረጉ...,../data/train/wav/tr_101_tr02001.wav,../data/train_new/tr_101_tr02001.wav,7.936
101,የ ውሀው ዘርፍ ያለበት ን የ ፋይናንስ ችግር ለ መፍታት የ ውሀ ሀብት ል...,../data/train/wav/tr_102_tr02002.wav,../data/train_new/tr_102_tr02002.wav,6.528
102,የ መንገደኞች ማስተናገጃ ህንጻው በ ሰአት እስከ ሶስት ሺ ያህል መንገደኞ...,../data/train/wav/tr_103_tr02003.wav,../data/train_new/tr_103_tr02003.wav,6.528


## Spectogramm

In [6]:
samples, sample_rate = librosa.load('../data/train_new/tr_10_tr01010.wav', sr=None)
X = librosa.stft(samples)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sample_rate, x_axis='time', y_axis='hz')
plt.colorbar()

FileNotFoundError: [Errno 2] No such file or directory: '../data/train_new/tr_10_tr01010.wav'

## Standardize Sampling Rate

In [None]:
# checking rate before standardizing

data =  wave.open('../data/train/wav/tr_10_tr01010.wav')
data.getparams()

In [None]:
DV.visualize('../data/train/wav/tr_10_tr01010.wav')

In [None]:
DC.standardize(meta_data)

In [None]:
# checking rate after standardizing 
data =  wave.open('../data/train_new/tr_10_tr01010.wav')
data.getparams()

In [None]:
DV.visualize('../data/train_new/tr_10_tr01010.wav')

**Observation**
- sample rates have been sucessfully standardized
- framrate = 44100


## Resizing Audio Samples

In [None]:
# checking duration before resizing

meta_data= DC.add_duration(meta_data, output= True)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

In [None]:
DC.resize_pad_trunc(meta_data, 5000)

In [None]:
# checking truncated result
meta_data= DC.add_duration(meta_data, output= True)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

## Convert Mono to Stereo

In [None]:
# adding new column for channels count

meta_data= DC.channel_count(meta_data)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

In [None]:
# checking number of channels

meta_data["n_channel"].value_counts()

**Observation**
- All the loadded audio samples are mono
- 400 is just a placeholder for missing audio files 
  because only a limited number of samples are taken for development
  This will not be present when the data is complete.

In [None]:
# Converting to stereo

DC.make_stereo(meta_data, True)

In [None]:
# making sure everything is right
data =  wave.open('../data/train_new/tr_10_tr01010.wav')
print("the parameters are: ", data.getparams())
meta_data= DC.add_duration(meta_data, output= True)
meta_data= DC.channel_count(meta_data, output= True)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()


In [None]:
DV.visualize('../data/train_new/tr_10_tr01010.wav')

**Observation**
- audio samples have been successfully converted to stereo
- 400 is just a placeholder for missing audio files 
  because only a limited number of samples are taken for development
  This will not be present when the data is complete.

## Data Augumentation
### Time Shift

DC.time_shift(meta_data, int(sample_rate/10), True)

In [None]:
# making sure everything is right
data =  wave.open('../data/train_new/tr_10_tr01010.wav')
print("the parameters are: ", data.getparams())
meta_data= DC.add_duration(meta_data, output= True)
meta_data= DC.channel_count(meta_data, output= True)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

In [None]:
DV.visualize('../data/train_new/tr_10_tr01010.wav')

In [None]:
samples, sample_rate = librosa.load('../data/train_new/tr_10_tr01010.wav', sr=None)
DV.plot_spec(data=samples,sr=sample_rate)

## Feature Extraction

Here we will be using Mel-Frequency Cepstral Coefficients(MFCC) from the audio samples. The MFCC summarises the frequency distribution across the window size, so it is possible to analyse both the frequency and time characteristics of the sound. These audio representations will allow us to identify features for classification.

In [None]:
features_df = DC.total_feature_extractor(meta_data)

In [None]:
features_df.head()

In [None]:

samples, sample_rate = librosa.load('../data/train_new/tr_10_tr01010.wav', sr=None)
# MFCC
plt.figure(figsize=(20,5))
mfccs = librosa.feature.mfcc(samples, sr=sample_rate)
print(mfccs.shape)

librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time')

### saving processed data and meta_data

In [None]:
# saving data 

features_df.to_csv("../data/data.csv")
meta_data.to_csv("../data/meta_data.csv")
