In [10]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import dot
from numpy.linalg import norm
import seaborn as sns
import matplotlib.pyplot as plt
import urllib.request

from sentence_transformers import SentenceTransformer

import librosa
import librosa.display
from IPython.display import Audio
from tqdm import tqdm

In [37]:
# Build the data directory from the notebook's working directory.
# Backslashes become forward slashes and every "c:" occurrence (e.g. a
# lower-case drive letter) is rewritten as "C:".
# `data_path` keeps its trailing slash; later cells append file names to it.
current_path = os.getcwd().replace("\\", "/").replace("c:", "C:")
data_path = f"{current_path}/data/"

In [39]:
# Load the label/transcript table; the file is decoded as CP949.
label_csv = data_path+"/5차_10011.csv"
txt_data = pd.read_csv(label_csv, encoding='CP949')

In [43]:
# Lower-case the five emotion-label columns so that differently-cased spellings
# of the same label end up as one class when the votes are tallied later.
# The vectorised `.str.lower()` accessor replaces `.apply(str.lower)`: it gives
# the same result for strings but passes missing values (NaN) through instead of
# raising TypeError, and the loop removes five copy-pasted lines.
for emotion_col in ['1번 감정', '2번 감정', '3번 감정', '4번 감정', '5번 감정']:
  txt_data[emotion_col] = txt_data[emotion_col].str.lower()

In [44]:
def get_keys(dic):
  """Return the key holding the largest value in `dic`.

  If several keys share the maximum value, the one that comes first in the
  dictionary's iteration order is returned. An empty dictionary raises
  ValueError.
  """
  return max(dic, key=dic.get)

# (emotion column, intensity column) pairs, one pair per numbered rating.
# NOTE: the 4th intensity column really is spelled without a space in this code.
rating_columns = [
  ('1번 감정', '1번 감정세기'),
  ('2번 감정', '2번 감정세기'),
  ('3번 감정', '3번 감정세기'),
  ('4번 감정', '4번감정세기'),
  ('5번 감정', '5번 감정세기'),
]
# Order matters: on a tie `get_keys` returns the first emotion listed here.
emotion_names = ['angry', 'sadness', 'happiness', 'fear', 'disgust', 'surprise', 'neutral']

# For every utterance, add up the intensity given to each emotion and keep the
# emotion with the highest total as the final label.
final_label = []
for row_pos in range(len(txt_data)):
  row = txt_data.iloc[row_pos]
  scores = dict.fromkeys(emotion_names, 0)
  for emotion_col, intensity_col in rating_columns:
    scores[row[emotion_col]] += row[intensity_col]
  final_label.append(get_keys(scores))

final_label_df = pd.DataFrame(final_label, columns=['final_label'])
# Column-wise concat aligns on the index of both frames.
new_txt_data = pd.concat([txt_data[['wav_id', '발화문']], final_label_df], axis = 1)

In [47]:
audio_path = data_path+'5차_wav'
# Keep only .wav files: os.listdir() also returns anything else that lives in
# the folder (hidden files, sub-directories, ...), and such entries have no
# matching wav_id in the label table, which breaks the lookup that follows.
# Sorting gives a stable starting order, because os.listdir() order is arbitrary.
wav_list = sorted(
    file_name for file_name in os.listdir(audio_path)
    if file_name.lower().endswith('.wav')
)
# Shuffled copy of the file list. NOTE: `random` is not seeded in this cell, so
# the order differs between runs unless a seed is set beforehand.
wav_list_tmp = random.sample(wav_list, len(wav_list))

In [48]:
# Strip the extension from each (shuffled) file name to get the bare wav_id.
wav_list_tmp_id = [os.path.splitext(file_name)[0] for file_name in wav_list_tmp]

# Build wav_id -> label / sentence lookups once instead of scanning the whole
# label table twice for every audio file. If a wav_id occurs more than once in
# the table, the first row wins (same as taking `.values[0]` of the matches).
first_rows = new_txt_data.drop_duplicates(subset='wav_id', keep='first')
label_by_id = dict(zip(first_rows['wav_id'], first_rows['final_label']))
sentence_by_id = dict(zip(first_rows['wav_id'], first_rows['발화문']))

# Fail with a readable message when an audio file has no row in the label table.
missing_ids = [wav_id for wav_id in wav_list_tmp_id if wav_id not in label_by_id]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} audio file(s) have no matching wav_id in the label table, "
        f"e.g. {missing_ids[:5]}"
    )

wav_list_tmp_label = [label_by_id[wav_id] for wav_id in wav_list_tmp_id]
wav_list_tmp_sentence = [sentence_by_id[wav_id] for wav_id in wav_list_tmp_id]

# One row per audio file: id, final emotion label and the spoken sentence.
wav_df = pd.DataFrame(
    {'wav_id': wav_list_tmp_id,
     'final_label': wav_list_tmp_label,
     'sentence': wav_list_tmp_sentence
    })

In [49]:
# Down-sample the 'sadness' and 'angry' classes to at most this many rows each.
max_per_class = 600

sad = wav_df[wav_df['final_label'] == 'sadness']
angry = wav_df[wav_df['final_label'] == 'angry']

# Cap at the class size: DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement), so a smaller class is kept whole.
# NOTE: no random_state is passed, so the selected rows differ between runs.
sad = sad.sample(n=min(max_per_class, len(sad)))
angry = angry.sample(n=min(max_per_class, len(angry)))

# Every other class is kept in full.
else_df = wav_df[~wav_df['final_label'].isin(['sadness', 'angry'])]

# NOTE: this rebinds `wav_df`; the un-sampled frame is no longer available after this cell.
wav_df = pd.concat([angry, sad, else_df])

In [50]:
def noise(data):
    """Return a copy of `data` with Gaussian noise added.

    The noise amplitude is 0.035 * U(0, 1) * max(data), so it is drawn afresh
    on every call. The input array is not modified.
    """
    amplitude = 0.035*np.random.uniform()*np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude*gaussian

def stretch(data, rate=0.8):
    """Time-stretch `data` with librosa.effects.time_stretch.

    Parameters
    ----------
    data : audio signal passed straight to librosa.
    rate : stretch factor forwarded to librosa (default 0.8).

    The `rate` argument is now honoured; previously the call hard-coded 0.8
    and silently ignored whatever the caller passed.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift `data` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated to
    an int; np.roll wraps the samples that fall off one end around to the other.
    """
    offset = int(1000*np.random.uniform(low=-5, high=5))
    return np.roll(data, offset)

def pitch(data, sampling_rate, n_steps=1):
    """Pitch-shift `data` with librosa.effects.pitch_shift.

    Parameters
    ----------
    data : audio signal passed straight to librosa.
    sampling_rate : sampling rate of `data`, forwarded as `sr`.
    n_steps : shift amount forwarded to librosa. Defaults to 1, which is the
        value that used to be hard-coded, so existing calls behave as before.
    """
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)

In [51]:
def extract_features(data, sample_rate):
    """Return one flat feature vector for an audio signal.

    Five librosa features are computed, each is averaged along the first axis
    of its transpose, and the averages are joined in this order: zero-crossing
    rate, chroma (from the STFT magnitude), MFCC, RMS, mel spectrogram.
    """
    def averaged(feature_matrix):
        # Transpose, then mean over axis 0 -> one value per row of the original.
        return np.mean(feature_matrix.T, axis=0)

    zcr = averaged(librosa.feature.zero_crossing_rate(y=data))

    # Chroma is computed from the magnitude of the short-time Fourier transform.
    magnitude = np.abs(librosa.stft(data))
    chroma_stft = averaged(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    mfcc = averaged(librosa.feature.mfcc(y=data, sr=sample_rate))
    rms = averaged(librosa.feature.rms(y=data))
    mel = averaged(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty array fixes the result dtype exactly as stacking onto
    # an initially empty `np.array([])` does.
    return np.hstack((np.array([]), zcr, chroma_stft, mfcc, rms, mel))

In [52]:
def get_features(path):
    """Load one audio file and return a single concatenated feature vector.

    The file is read with librosa.load(duration=2.5, offset=0.0). Features are
    extracted from three versions of the signal and joined end to end:
    the signal as loaded, the signal with random noise added, and the signal
    after time-stretching followed by pitch-shifting.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.0)

    # Signal as loaded.
    plain = np.array(extract_features(data, sample_rate))

    # Noisy variant.
    noisy = extract_features(noise(data), sample_rate)

    # Stretched, then pitch-shifted variant.
    stretched_pitched = extract_features(pitch(stretch(data), sample_rate), sample_rate)

    return np.concatenate((plain, noisy, stretched_pitched), axis = 0)

In [54]:
# Extract one feature vector per row of `wav_df`, keeping the labels in step.
X_audio, Y = [], []
id_label_pairs = zip(wav_df['wav_id'], wav_df['final_label'])
for wav_id, emotion in tqdm(id_label_pairs, total=len(wav_df)):
    X_audio.append(get_features(audio_path+'/'+wav_id+'.wav'))
    Y.append(emotion)

# One row per audio file, one integer-named column per feature value.
audio_features = pd.DataFrame(X_audio)

100%|██████████| 2614/2614 [04:18<00:00, 10.11it/s]


In [55]:
# Print the label of every feature column that has at least one missing value.
null_counts = audio_features.isnull().sum()
for column_label in null_counts[null_counts != 0].index:
  print(column_label)

In [56]:
# Give both frames a fresh 0..n-1 index (in place, old index discarded) so that
# the column-wise concat in the next cell lines the rows up by position.
for frame in (audio_features, wav_df):
    frame.reset_index(drop=True, inplace=True)

In [57]:
# Put the feature columns next to the id / label / sentence columns and save
# the combined table as CSV without the index column.
final_df = pd.concat([audio_features, wav_df], axis=1)
output_csv = data_path+'final_df.csv'
final_df.to_csv(output_csv, index=False)