## Data Preprocessing

In [10]:
# loading libraries
import librosa   #for audio processing
import librosa.display
import wave
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import sys
from scipy.io import wavfile #for audio processing
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Make the project's helper modules importable from this notebook.
# '../scripts' is placed at index 1 of sys.path (directly after the first
# entry); the parent and current directories are appended at the end.
sys.path.insert(1, "../scripts")
sys.path.extend(["..", "."])

from data_cleaning import DataCleaner

# Single DataCleaner instance reused by the cells below.
# NOTE(review): the argument looks like a log-file path — confirm against
# DataCleaner.__init__ in scripts/data_cleaning.py.
DC = DataCleaner("../logs/preprocessing_notebook.log")

## Load metadata

In [12]:
# loading meta data
# `path` is the directory handed to generate_metadata as the data source and
# `output` is the directory handed to it as the destination.
path = "../data/train"
output = "../data/train_new"

# Build the metadata table and preview its first rows. The recorded preview
# shows three columns: Target (text), Feature (a wav path under `path`) and
# Output (a wav path under `output`).
meta_data = DC.generate_metadata(path, output)
meta_data.head()


Unnamed: 0,Target,Feature,Output
0,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,../data/train/wav/tr_1_tr01001.wav,../data/train_new/tr_1_tr01001.wav
1,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስፖ...,../data/train/wav/tr_2_tr01002.wav,../data/train_new/tr_2_tr01002.wav
2,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ ...,../data/train/wav/tr_3_tr01003.wav,../data/train_new/tr_3_tr01003.wav
3,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,../data/train/wav/tr_4_tr01004.wav,../data/train_new/tr_4_tr01004.wav
4,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳጀ ች,../data/train/wav/tr_5_tr01005.wav,../data/train_new/tr_5_tr01005.wav


## Convert Mono to Stereo

In [13]:
# adding new column for channels count
# The table returned by channel_count is rebound to `meta_data`; the next
# cell reads its "n_channel" column.

meta_data= DC.channel_count(meta_data)

In [14]:
# checking number of channels
# Tally how many rows carry each distinct "n_channel" value.

meta_data["n_channel"].value_counts()

400    9878
1       997
Name: n_channel, dtype: int64

**Observation**
- All the loaded audio samples are mono (1 channel).
- 400 is just a placeholder value for missing audio files: only a limited
  number of samples were taken for development.
  It will not be present once the data is complete.

In [15]:
# Converting to stereo
# Hands the whole metadata table to DataCleaner.make_stereo; the result is
# not assigned to anything.
# NOTE(review): presumably writes a stereo copy of each Feature file to its
# Output path — confirm in scripts/data_cleaning.py.

DC.make_stereo(meta_data)

In [16]:
# checking number of channels again
# Recompute "n_channel" after the stereo conversion, then tally the values.
# NOTE(review): the extra `True` argument presumably makes channel_count
# inspect the converted (Output) files instead of the originals — confirm
# against DataCleaner.channel_count.

meta_data= DC.channel_count(meta_data, True)
meta_data["n_channel"].value_counts()

400    9878
2       997
Name: n_channel, dtype: int64

**Observation**
- Audio samples have been successfully converted to stereo (2 channels).
- 400 is just a placeholder value for missing audio files: only a limited
  number of samples were taken for development.
  It will not be present once the data is complete.

## Standardize Sampling Rate

In [17]:
# Hands the whole metadata table to DataCleaner.standardize; the result is
# not assigned to anything.
# NOTE(review): presumably resamples every file to one common rate (the
# check in the next cell reports framerate=44100) — confirm in
# scripts/data_cleaning.py.
DC.standardize(meta_data)

In [18]:
# Spot-check one processed file by reading its WAV header parameters.
# The reader is opened in a `with` block so the underlying file handle is
# closed as soon as the parameters have been read (the bare wave.open call
# left it open for the lifetime of the kernel).
with wave.open('../data/train_new/tr_10_tr01010.wav') as data:
    wav_params = data.getparams()

# NOTE(review): the recorded output for this cell reports nchannels=1 and
# nframes=0 — worth checking that the processed file is stereo and non-empty.
wav_params

_wave_params(nchannels=1, sampwidth=2, framerate=44100, nframes=0, comptype='NONE', compname='not compressed')

**Observation**
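- Sample rates have been successfully standardized.
- framerate = 44100
- Note: the output above also reports `nchannels=1` and `nframes=0` for this
  file, so its channel count and audio content should be verified.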
