In [None]:
import numpy as np
import pandas as pd
import random as rn
import os

from scipy.io import wavfile
import librosa

import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# reproducibility

def all_seed(seed_num):
    """Seed every random source this notebook uses.

    Sets PYTHONHASHSEED in the environment, then seeds the stdlib `random`
    module and NumPy's global generator.

    Args:
        seed_num: Integer seed applied to each generator.
    """
    # Setting PYTHONHASHSEED at runtime only affects child processes; the
    # running interpreter's hash randomization was fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed_num)
    rn.seed(seed_num)
    np.random.seed(seed_num)
    # tf.random.set_seed(seed_num)

seed_num = 42
all_seed(seed_num)

a_filename = './wav/Sess01_script01_User001F_001.wav'
# Keep the rate returned by librosa.load: it is the rate the samples actually
# have, and every plot below must use it.
samples, sample_rate = librosa.load(a_filename)

plt.figure(figsize=(10, 7))

# Use the real sample rate; a hard-coded rate mis-scales the time axis.
librosa.display.waveshow(samples, sr=sample_rate)

plt.xlabel('time', fontsize = 14)
plt.ylabel('amplitude', fontsize = 14)
plt.title('001.wav | Length : ' + str(len(samples)))

plt.show()

# Short-time Fourier transform of the same samples, magnitudes converted to dB.
X = librosa.stft(samples)
Xdb = librosa.amplitude_to_db(abs(X))

plt.figure(figsize=(12, 3))
plt.title('001.wav spectrogram | Length : ' + str(len(samples)))
librosa.display.specshow(Xdb, sr = sample_rate, x_axis='time', y_axis='hz')
plt.colorbar()
plt.show()

In [None]:

from sklearn.preprocessing import MinMaxScaler
def extract_features(sample, sr=22050, n_mfcc=50, max_frames=600):
    """Compute a fixed-size, min-max scaled MFCC matrix for one audio signal.

    Args:
        sample: 1-D audio time series.
        sr: Sampling rate of `sample`. Defaults to 22050, the rate
            `librosa.load` resamples to when called without `sr`, so the
            function no longer depends on a notebook-global `sample_rate`.
        n_mfcc: Number of MFCC coefficients to compute.
        max_frames: Number of columns the MFCC matrix is truncated or
            zero-padded to.

    Returns:
        Array with one leading axis added in front of the scaled
        (n_mfcc, max_frames) matrix.
    """
    # MFCC
    mfcc = librosa.feature.mfcc(y=sample, sr=sr, n_mfcc=n_mfcc)
    padded_mfcc = pad2d(mfcc, max_frames)  # truncate / zero-pad the columns

    # MinMaxScaler rescales every column independently to [0, 1].
    # NOTE(review): the columns here are the padded time frames, so scaling is
    # per frame across coefficients - confirm this is the intended axis.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(padded_mfcc)
    scaled_padded_mfcc = scaler.transform(padded_mfcc)

    return np.array([scaled_padded_mfcc])
def pad2d(a, i):
    """Force a 2-D array to exactly `i` columns.

    Columns beyond `i` are dropped; a narrower (or equally wide) array gets
    zero-filled columns appended on the right.
    """
    n_cols = a.shape[1]
    if n_cols > i:
        return a[:, 0:i]
    filler = np.zeros((a.shape[0], i - n_cols))
    return np.hstack((a, filler))



In [None]:

# Re-seed NumPy's global generator for the augmentation draws below.
np.random.seed(14)
def noise(sample, rate):
    """Return `sample` with Gaussian noise added.

    The noise amplitude is `rate` times one uniform draw from [0, 1) times
    the maximum value of `sample`.
    """
    amplitude = rate * np.random.uniform() * np.amax(sample)
    gaussian = np.random.normal(size=sample.shape[0])
    return sample + amplitude * gaussian

# time stretching
def stretch(sample, rate):
    """Time-stretch `sample` by `rate` via librosa.effects.time_stretch."""
    return librosa.effects.time_stretch(y=sample, rate=rate)


# pitch shifting
def pitch(sample, pitch_factor, sampling_rate=22050):
    """Pitch-shift `sample` via librosa.effects.pitch_shift.

    Args:
        sample: Audio time series to shift.
        pitch_factor: Passed to librosa as `n_steps`.
        sampling_rate: Passed to librosa as `sr`.
    """
    return librosa.effects.pitch_shift(y=sample, sr=sampling_rate, n_steps=pitch_factor)


def get_features(path, num):
    """Load one audio file and return features for it and its augmented copies.

    Rows are produced in this order: the unmodified signal, `num // 2`
    noise-augmented copies, then `num // 2` stretched-and-pitch-shifted copies.
    The result therefore has `1 + 2 * (num // 2)` rows: `num + 1` when `num`
    is even, `num` when it is odd.

    Args:
        path: Audio file path handed to `librosa.load`.
        num: Requested number of augmented copies (see the row count above).

    Returns:
        The `extract_features` outputs stacked along axis 0.
    """
    sample, sample_rate = librosa.load(path)

    # Collect the pieces and stack once at the end; calling np.vstack inside
    # the loops re-copies the whole accumulated array on every iteration.
    features = [extract_features(sample)]  # without augmentation

    for _ in range(num // 2):
        # sample with noise
        rate = np.random.uniform(0, 0.2)
        features.append(extract_features(noise(sample, rate)))

    for _ in range(num // 2):
        # sample with stretching and pitching
        rate = np.random.uniform(0.7, 0.9)
        pitch_rate = np.random.uniform(0.7, 0.9)
        stretched = stretch(sample, rate)
        # Hand the loaded rate to the pitch shifter instead of relying on its default.
        shifted = pitch(stretched, pitch_rate, sampling_rate=sample_rate)
        features.append(extract_features(shifted))

    return np.vstack(features)

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open("train.pkl", "rb") as train_file:
    data = pickle.load(train_file)
data

In [None]:
# List the column labels of the object loaded from train.pkl.
data.columns

In [None]:
#from sklearn.model_selection import train_test_split
#x_train, x_valid, y_train, y_valid = train_test_split(train[['Segment ID','temp+eda','text_tokenize','mfcc_scaled']], data['sentiment_x'], test_size=0.2, shuffle=True, random_state=4)
# No hold-out split: every row of `data` goes into the training inputs/labels.
x_train, y_train = (
    data[['Segment ID', 'temp+eda', 'text_tokenize', 'mfcc_scaled']],
    data['sentiment_x'],
)

In [None]:
# Count rows per sentiment label.
data['sentiment_x'].value_counts()

In [None]:
# Row count for each sentiment label 0-6, in label order.
tuple(len(data[data['sentiment_x'] == label_id]) for label_id in range(7))

In [None]:
#yes0
#no0
tmp = pd.concat([x_train, y_train], axis=1)
# One frame per sentiment label 0-6, in label order.
yes0, yes1, yes2, yes3, yes4, yes5, yes6 = (
    tmp[tmp['sentiment_x'] == label_id] for label_id in range(7)
)


In [None]:
# Label-0 rows; this frame is concatenated later as-is, without augmentation.
yes0 # not augmented

In [None]:
# Display the training labels.
y_train

# yes1

In [None]:
from tqdm import tqdm
labels = yes1['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes1['Segment ID']]
NUM=6
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))

In [None]:
# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM+1:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM+1 only for even NUM.
copies = len(x) // max(len(yes1), 1)
assert copies * len(yes1) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes1['Segment ID'] for _ in range(copies)]
text = [i for i in yes1['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes1['temp+eda'] for _ in range(copies)]

In [None]:

# Assemble the augmented label-1 rows into one frame.
yes1df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

In [None]:
# Shape of the first stored (transposed) feature matrix.
x[0].shape

In [None]:
# Display the augmented label-1 frame.
yes1df

In [None]:
# Persist the augmented label-1 frame with the newest pickle protocol.
with open('augmentedyes1.pkl', 'wb') as out_file:
    pickle.dump(yes1df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# yes2

In [None]:
from tqdm import tqdm
labels = yes2['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes2['Segment ID']]
NUM=44
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))

# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM+1:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM+1 only for even NUM.
copies = len(x) // max(len(yes2), 1)
assert copies * len(yes2) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes2['Segment ID'] for _ in range(copies)]
text = [i for i in yes2['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes2['temp+eda'] for _ in range(copies)]


yes2df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

yes2df

In [None]:
# Persist the augmented label-2 frame with the newest pickle protocol.
with open('augmentedyes2.pkl', 'wb') as out_file:
    pickle.dump(yes2df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# yes3

In [None]:
# Debug check: compare the lengths of the lists that feed the DataFrame with
# an expected row count for yes3.
# NOTE(review): when cells run top to bottom, x/y/seg/text/bio still hold the
# previous class's values here, because the yes3 loop is in the next cell.
print(len(x), len(y), len(seg), len(text), len(bio))
#yes5df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
len(yes3)*49

In [None]:
from tqdm import tqdm
labels = yes3['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes3['Segment ID']]
NUM=49
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))


In [None]:
# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM only for odd NUM.
copies = len(x) // max(len(yes3), 1)
assert copies * len(yes3) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes3['Segment ID'] for _ in range(copies)]
text = [i for i in yes3['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes3['temp+eda'] for _ in range(copies)]


yes3df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

yes3df

In [None]:
# Persist the augmented label-3 frame with the newest pickle protocol.
with open('augmentedyes3.pkl', 'wb') as out_file:
    pickle.dump(yes3df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# yes4

In [None]:
from tqdm import tqdm
labels = yes4['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes4['Segment ID']]
NUM=57
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))


In [None]:
# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM only for odd NUM.
copies = len(x) // max(len(yes4), 1)
assert copies * len(yes4) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes4['Segment ID'] for _ in range(copies)]
text = [i for i in yes4['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes4['temp+eda'] for _ in range(copies)]


yes4df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

yes4df

In [None]:
# Persist the augmented label-4 frame with the newest pickle protocol.
with open('augmentedyes4.pkl', 'wb') as out_file:
    pickle.dump(yes4df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# yes5

In [None]:
from tqdm import tqdm
labels = yes5['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes5['Segment ID']]
NUM=88
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))


In [None]:
# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM+1:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM+1 only for even NUM.
copies = len(x) // max(len(yes5), 1)
assert copies * len(yes5) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes5['Segment ID'] for _ in range(copies)]
text = [i for i in yes5['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes5['temp+eda'] for _ in range(copies)]


yes5df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

yes5df

In [None]:
# Persist the augmented label-5 frame with the newest pickle protocol.
with open('augmentedyes5.pkl', 'wb') as out_file:
    pickle.dump(yes5df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# yes6

In [None]:
from tqdm import tqdm
labels = yes6['sentiment_x']
x, y = [], []
file_name = ['./wav/'+f+'.wav' for f in yes6['Segment ID']]
NUM=202
# zip() has no length, so pass the total explicitly to get a real progress bar.
for f, label in tqdm(zip(file_name, labels), total=len(file_name)):
    feature = get_features(f, NUM)
    # Store every returned feature matrix transposed, with one label per matrix.
    x.extend(np.array(fe).transpose() for fe in feature)
    y.extend([label] * len(feature))

# Repeat each metadata value once per feature row produced for its file.
# The count is derived from the data instead of being written as NUM+1:
# get_features returns 1 + 2*(NUM//2) rows, which equals NUM+1 only for even NUM.
copies = len(x) // max(len(yes6), 1)
assert copies * len(yes6) == len(x), "feature rows are not a whole multiple of the source rows"
seg = [i for i in yes6['Segment ID'] for _ in range(copies)]
text = [i for i in yes6['text_tokenize'] for _ in range(copies)]
bio = [i for i in yes6['temp+eda'] for _ in range(copies)]


yes6df = pd.DataFrame({
    'Segment ID': seg,
    'mfcc_scaled': x,
    'text_tokenize': text,
    'temp+eda': bio,
    'sentiment_x': y,
})

yes6df

In [None]:
# Persist the augmented label-6 frame with the newest pickle protocol.
with open('augmentedyes6.pkl', 'wb') as out_file:
    pickle.dump(yes6df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Stack the untouched label-0 rows on top of the six augmented frames.
# The index is not reset here, so each frame keeps its own row labels.
class_frames = [yes0, yes1df, yes2df, yes3df, yes4df, yes5df, yes6df]
DF = pd.concat(class_frames)

In [None]:
# Rows per sentiment label in the combined frame.
DF['sentiment_x'].value_counts()

In [None]:
# Persist the combined (not yet shuffled) frame with the newest pickle protocol.
with open('augmented2.pkl', 'wb') as out_file:
    pickle.dump(DF, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Shuffle with an explicit seed so the two halves are reproducible regardless
# of how much of NumPy's global random stream earlier cells consumed.
DF = DF.sample(frac=1, random_state=seed_num).reset_index(drop=True)

# Split the shuffled rows at the midpoint; the second half gets the extra row
# when the row count is odd.
half = len(DF) // 2
with open('augmentedhalf1.pkl','wb') as fw:
    pickle.dump(DF.iloc[:half], fw, pickle.HIGHEST_PROTOCOL)
with open('augmentedhalf2.pkl','wb') as fw:
    pickle.dump(DF.iloc[half:], fw, pickle.HIGHEST_PROTOCOL)
