## Data Preprocessing

In [None]:
# loading libraries
import librosa   #for audio processing
import librosa.display
import wave
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import math
import matplotlib.pylab as plt
import seaborn as sns
import sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import json
from scipy.io import wavfile #for audio processing
import warnings
warnings.filterwarnings("ignore")

In [None]:
# loading scripts
# The project modules imported below are not installed packages, so the
# search path is extended first: ../scripts is inserted near the front, and
# the parent and current directories are appended. These path edits must run
# before the imports that follow.
sys.path.insert(1, '../scripts')
sys.path.append("..")
sys.path.append(".")

from meta_cleaner import MetaCleaner
from audio_cleaner import AudioCleaner
from data_viz import Data_Viz

# Shared helper instances used by every cell below. Both cleaners are given
# the same path string (presumably the log file they write to - confirm in
# the scripts).
MC = MetaCleaner("../logs/preprocessing_notebook.log")
AC = AudioCleaner("../logs/preprocessing_notebook.log")
DV = Data_Viz()

## Processing Single Audio File

In [None]:
# load sample audio
# One training clip is read through the AudioCleaner; the two returned values
# are also kept together as a [samples, rate] list, which is the form handed
# to the AudioCleaner calls in the following cells.

sam, rat = AC.load_audio("../data/train/wav/tr_101_tr02001.wav")
audio = [sam, rat]
print("Sample: ", sam)
print("Rate: ", rat)

In [None]:
# changing channel
# Pass the loaded [samples, rate] list through AC.change_to_stereo and print
# the shape of the sample array before and after the call. The result is kept
# in `output` for the next cell.

print("Before: ", audio[0].shape)
output = AC.change_to_stereo(audio)
print("After: ", output[0].shape)

In [None]:
# changing sampling rate
# Element 1 of the audio pair is printed as the rate before and after
# AC.change_rate is called with 44100. The result is kept in `output2`.

print("Before: ", output[1])
output2 = AC.change_rate(output, 44100)
print("After: ", output2[1])
    

In [None]:
# changing duration 
# librosa.get_duration reports the clip length (in seconds) before and after
# AC.change_duration is called with 8100; the batch pipeline further down
# passes the same value under the keyword `max_ms`.

print("Before: ", librosa.get_duration(y=output2[0], sr=output2[1]))
output3 = AC.change_duration(output2, 8100)
print("After: ", librosa.get_duration(y=output3[0], sr=output3[1]))
    

In [None]:
# augmentation
# Apply AC.time_shift to the clip returned by change_duration. The result is
# stored in `output4`; no later cell in this notebook reads it.

output4 = AC.time_shift(output3)

## Processing in batch

### Metadata preparation

In [None]:
# creating meta data
# Build the metadata table from the raw training folder and the folder the
# processed files are written to, then preview its first rows.

path = "../data/train"
# Kept under its own name: `output` already holds the stereo clip produced in
# the single-file cells, and rebinding it to a path string here would make
# those cells fail (or misbehave) when re-run after this one.
output_path = "../data/train_new"

meta_data = MC.generate_metadata(path, output_path)
meta_data.head()

In [None]:
# adding duration column
# add_duration is called without using its return value (presumably it adds
# the "Duration" column to meta_data in place - confirm in MetaCleaner).
# The rows whose Duration is not 400 are then selected and previewed.

MC.add_duration(meta_data)
selection = meta_data.loc[meta_data["Duration"].ne(400)]
selection.head()

In [None]:
# Data processing pipeline

def build_pipe(ACl, param):
    """
    Assemble the audio preprocessing pipeline.

    ACl: object providing change_to_stereo, change_rate, change_duration and
         time_shift; each method is wrapped in a FunctionTransformer.
    param: indexable; param[0] is forwarded to change_rate as `sr` and
           param[1] to change_duration as `max_ms`.
    returns a Pipeline that applies the four steps in the order listed.
    """
    stages = [
        ("make stereo", FunctionTransformer(ACl.change_to_stereo)),
        (
            "change rate",
            FunctionTransformer(ACl.change_rate, kw_args={"sr": param[0]}),
        ),
        (
            "change length",
            FunctionTransformer(ACl.change_duration, kw_args={"max_ms": param[1]}),
        ),
        ("augument", FunctionTransformer(ACl.time_shift)),
    ]
    return Pipeline(steps=stages)

In [None]:
# Pipeline configured with the values used in the single-file cells above:
# 44100 is forwarded as the rate (`sr`) and 8100 as the duration limit (`max_ms`).
pipe = build_pipe(AC, [44100, 8100])
def batch_iterator(meta, pipe, ACl):
    """
    Run every audio file listed in `meta` through `pipe` and save the result.

    meta: table with a "Feature" column (path given to ACl.load_audio) and an
          "Output" column (path given to ACl.save_audio); a missing column
          raises KeyError before any file is processed.
    pipe: object with fit_transform, applied to each [samples, rate] list.
    ACl: object providing load_audio and save_audio.
    returns None.

    A file that fails to load is reported and skipped; errors raised while
    transforming or saving are not caught.
    """
    # Walk the two columns positionally so every row is visited whatever the
    # index labels are (a shuffled or filtered frame has no 0..n-1 labels, and
    # label lookups inside the try block used to skip such rows silently).
    for source, target in zip(meta["Feature"], meta["Output"]):
        try:
            sam, rat = ACl.load_audio(source)
        except Exception as err:
            # Best-effort batch: one unreadable file must not stop the run,
            # but the skip is made visible and KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            print(f"Skipping {source}: {err!r}")
            continue

        p_audio = pipe.fit_transform([sam, rat])
        ACl.save_audio(p_audio, target)
        
    

## Final check

In [None]:
# making sure everything is right
# The wave reader is opened in a `with` block so the file handle is closed as
# soon as the header parameters of the processed clip have been printed.
with wave.open('../data/train_new/tr_10_tr01010.wav') as data:
    print("the parameters are: ", data.getparams())
# Recompute the duration and channel columns with output=True, then list the
# rows whose Duration is still not 400.
meta_data = MC.add_duration(meta_data, output=True)
meta_data = MC.channel_count(meta_data, output=True)
selection = meta_data[meta_data["Duration"] != 400]
selection.head()

In [None]:
# Reload one processed clip and plot it with the project's Data_Viz helper.
# sr=None makes librosa keep the file's own sampling rate instead of
# resampling to its default.
samples, sample_rate = librosa.load('../data/train_new/tr_10_tr01010.wav', sr=None)
DV.plot_spec(data=samples,sr=sample_rate)

## Data Splitting

In [None]:
# shuffling and splitting data
# MC.split returns a sequence whose first two items are used as the train and
# test tables; their shapes are printed one per line.
# NOTE(review): the meaning of the arguments 80 and 2 is defined by
# MetaCleaner.split and is not visible in this notebook - confirm there.

dfs = MC.split(meta_data, 80, 2)
train, test = dfs[0], dfs[1]
print(train.shape, test.shape, sep="\n")

### saving processed meta_data

In [None]:
# saving data as csv
# NOTE(review): these calls were written against `DC`, which no cell of this
# notebook defines, so the cell raised NameError. MC is used on the
# assumption that meta_saver lives on MetaCleaner alongside the other
# metadata helpers - confirm against scripts/meta_cleaner.py.

MC.meta_saver(train, "../data/train_meta.csv", "csv")
MC.meta_saver(test, "../data/test_meta.csv", "csv")

In [None]:
# loading and checking csv data
# Reads the saved train metadata back in (note: the variable is called
# test_df but the file loaded is train_meta.csv) and previews it.
# NOTE(review): the call was written against `DC`, which no cell of this
# notebook defines (NameError). MC is used on the assumption that
# meta_loader lives on MetaCleaner - confirm against scripts/meta_cleaner.py.

test_df = MC.meta_loader("../data/train_meta.csv", "csv")
test_df.head()