In [2]:
import numpy as np
import pandas as pd
import random as rn
import os

from scipy.io import wavfile
import librosa

import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display

import warnings
warnings.filterwarnings("ignore")

In [None]:
import pickle
with open('train.pkl',"rb") as fr:
    data = pickle.load(fr)
data

In [76]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(data[['Segment ID','temp+eda','text_tokenize','mfcc_scaled']], data['sentiment_x'], test_size=0.125, shuffle=True, random_state=4)

In [77]:
pad2d = lambda a, i: a[:, 0:i] if a.shape[1] > i else np.hstack((a, np.zeros((a.shape[0], i-a.shape[1]))))

In [78]:
from sklearn.preprocessing import MinMaxScaler
def extract_features(sample):
    
    # MFCC
    mfcc =librosa.feature.mfcc(y = sample, sr = 16000 ,n_mfcc=50)
    padded_mfcc = pad2d(mfcc, 600) #padding
    scaler = MinMaxScaler(feature_range=(0,1)) ## 각 칼럼 데이터 값을 0~1 범위로 변환

    scaler.fit(padded_mfcc) ## 각 칼럼 데이터마다 변환할 함수 생성

    scaled_padded_mfcc = scaler.transform(padded_mfcc) ## fit에서 만들어진 함수를 실제로 데이터에 적용

    result = np.array([scaled_padded_mfcc])
    return result


In [79]:

np.random.seed(14) 
def noise(sample, rate):
    noise_amp = rate*np.random.uniform()*np.amax(sample)
    sample = sample + noise_amp*np.random.normal(size = sample.shape[0])
    return sample

In [152]:
def get_features(path,num):

    sample, sample_rate = librosa.load(path)
    
    rate = np.random.uniform(0,0.2)
    noise_sample = noise(sample,rate)
    res1 = extract_features(noise_sample)
    result = np.array(res1)

    for i in range(num-1):
        # sample with noise
        rate = np.random.uniform(0,0.2)
        noise_sample = noise(sample,rate)
        res1 = extract_features(noise_sample)
        result = np.vstack((result, res1)) 
    
    for i in range(num):
        y_pitch = sample.copy()
        bins_per_octave = 12
        pitch_pm = 2
        pitch_change =  pitch_pm * 2*(np.random.uniform())   
        y_pitch = librosa.effects.pitch_shift(y_pitch.astype('float64'), sr=16000, n_steps=pitch_change, bins_per_octave=bins_per_octave)
        y_pitch = extract_features(y_pitch)
        result = np.vstack((result, y_pitch)) 

    for i in range(num):
        y_aug = sample.copy()
        dyn_change = np.random.uniform(low=1.5,high=3)
        y_aug = y_aug * dyn_change
        y_aug = extract_features(y_aug)
        result = np.vstack((result, y_aug)) 

    return result

In [46]:
a=get_features('waaav/Sess01_script01_User001F_001.wav',2)

In [51]:
arr=[]
for i in a:
    arr.append(np.array(i).transpose())

In [60]:
seg = [0] in range(7)
text = [0] in range(7)
bio = [0] in range(7)

In [61]:
yes1df={'Segment ID': seg,'mfcc_scaled':arr,'text_tokenize':text, 'temp+eda':bio}
yes1df = pd.DataFrame(yes1df)

In [64]:
yes1df.drop_duplicates(subset=['mfcc_scaled'])

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda
0,False,"[[0.0, 1.0, 0.7907979727231563, 0.888733543697...",False,False
1,False,"[[0.0, 0.9974658736370667, 0.9825420774811447,...",False,False
2,False,"[[0.0, 1.0, 0.9689731006542772, 0.964144349991...",False,False
3,False,"[[0.0, 1.0, 0.7816415952209347, 0.892297778614...",False,False
4,False,"[[0.0, 1.0, 0.765783977333432, 0.8936367316178...",False,False
5,False,"[[0.0, 1.0, 0.753455247451137, 0.8688724099502...",False,False
6,False,"[[0.0, 1.0, 0.7669277774677178, 0.876037919110...",False,False


In [70]:

y_train.value_counts()

0    7393
1     934
2     154
3     142
4     123
5      79
6      35
Name: sentiment_x, dtype: int64

In [81]:
#yes0
#no0
tmp = pd.concat([x_train,y_train], axis=1)
yes1 =tmp[tmp['sentiment_x']==1]
yes2 =tmp[tmp['sentiment_x']==2]
yes3 =tmp[tmp['sentiment_x']==3]
yes4 =tmp[tmp['sentiment_x']==4]
yes5 =tmp[tmp['sentiment_x']==5]
yes6 =tmp[tmp['sentiment_x']==6]

In [82]:
yes1

Unnamed: 0,Segment ID,temp+eda,text_tokenize,mfcc_scaled,sentiment_x
3210,Sess10_script03_User020M_032,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[259, 392, 22, 340, 3308, 129, 57, 463, 340, 7...","[[0.06931926312442405, 0.564872953927398, 0.74...",1
243,Sess01_script05_User002M_034,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[84, 160, 49, 510, 79, 61, 7057, 1417, 505, 17...","[[0.10744950158776012, 0.4078630854532514, 0.4...",1
3055,Sess10_script01_User019F_007,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 350, 15, 1745, 51, 272, 192, 1, 18, 8768,...","[[0.0, 0.5341681653552921, 0.9298299200288298,...",1
3625,Sess11_script06_User021M_007,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[336, 87, 87, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[[0.0, 0.5387102744512207, 0.6856873128326555,...",1
3056,Sess10_script01_User019F_008,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[350, 15, 1745, 4087, 58, 8769, 0, 0, 0, 0, 0,...","[[0.579841260582342, 0.3879373882837065, 0.218...",1
...,...,...,...,...,...
748,Sess03_script03_User005M_033,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[1992, 12, 1557, 7423, 23, 850, 0, 0, 0, 0, 0,...","[[0.0, 0.5491286008737722, 0.6548360528751691,...",1
7868,Sess27_script01_User053M_007,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[88, 14, 105, 7, 14, 128, 107, 11969, 107, 119...","[[0.2207304573159644, 0.5337093280651524, 0.35...",1
7354,Sess25_script03_User050F_031,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 276, 5, 45, 51, 11711, 0, 0, 0, 0, 0, 0, ...","[[0.0, 0.5463543612802085, 0.5563269858524766,...",1
1080,Sess04_script03_User008F_027,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[3, 626, 1, 33, 1459, 777, 352, 1459, 777, 352...","[[0.048905902886572816, 0.4750492055078576, 0....",1


In [85]:
from tqdm import tqdm
labels = yes1['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes1['Segment ID']]
NUM=1
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM)
    #print(feature)
    for fe in feature:
        x.append(np.array(fe).transpose())

        y.append(label)

934it [05:28,  2.84it/s]


In [86]:
len(x)

2802

In [88]:
seg = [i for i in yes1['Segment ID'] for s in range(3*NUM)]
text = [i for i in yes1['text_tokenize'] for s in range(3*NUM)]
bio = [i for i in yes1['temp+eda'] for s in range(3*NUM)]

In [108]:
yes1df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes1df = pd.DataFrame(yes1df)

In [102]:
import random
random.sample(range(2082),10)

[1198, 533, 1946, 1263, 1449, 1678, 1503, 459, 230, 1056]

In [110]:
yes1df.drop(random.sample(range(2082),236), axis=0, inplace=True)
yes1df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
1,Sess10_script03_User020M_032,"[[0.0, 1.0, 0.8744735104779164, 0.886617451014...","[259, 392, 22, 340, 3308, 129, 57, 463, 340, 7...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
2,Sess10_script03_User020M_032,"[[0.0, 1.0, 0.8586356206702327, 0.871486027462...","[259, 392, 22, 340, 3308, 129, 57, 463, 340, 7...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
4,Sess01_script05_User002M_034,"[[0.0, 0.9999999999999999, 0.8333884540213234,...","[84, 160, 49, 510, 79, 61, 7057, 1417, 505, 17...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
5,Sess01_script05_User002M_034,"[[0.0, 0.9999999999999999, 0.8155734903959928,...","[84, 160, 49, 510, 79, 61, 7057, 1417, 505, 17...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
6,Sess10_script01_User019F_007,"[[0.0, 0.9698028214197132, 0.9830630593254768,...","[15, 350, 15, 1745, 51, 272, 192, 1, 18, 8768,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
...,...,...,...,...,...
2797,Sess04_script03_User008F_027,"[[0.0, 1.0, 0.8335997686841178, 0.860999485309...","[3, 626, 1, 33, 1459, 777, 352, 1459, 777, 352...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
2798,Sess04_script03_User008F_027,"[[0.0, 1.0, 0.8370398879051208, 0.822929441928...","[3, 626, 1, 33, 1459, 777, 352, 1459, 777, 352...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
2799,Sess14_script01_User027F_013,"[[0.0, 0.9999999999999999, 0.9889627672155138,...","[1629, 1366, 2773, 43, 4, 106, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
2800,Sess14_script01_User027F_013,"[[0.0, 1.0, 0.8615710092461866, 0.868354542207...","[1629, 1366, 2773, 43, 4, 106, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1


In [114]:
dddd=pd.concat([yes1df,yes1]).reset_index()
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)

# 2번 감정

In [150]:
yes2=yes2.reset_index()
yes2

Unnamed: 0,level_0,index,Segment ID,temp+eda,text_tokenize,mfcc_scaled,sentiment_x
0,0,772,Sess03_script04_User005M_008,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[[0.0, 0.19094903516377562, 0.6286003123403705...",2
1,1,6544,Sess22_script06_User044F_018,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[68, 6238, 4454, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.030517322402882407, 0.576732543580456, 0.7...",2
2,2,4528,Sess15_script02_User030M_024,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[107, 405, 101, 409, 405, 101, 341, 90, 82, 21...","[[0.41263037559718563, 0.503056404496469, 0.68...",2
3,3,6328,Sess22_script02_User043F_009,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[28, 804, 136, 115, 16, 28, 103, 36, 892, 6, 1...","[[0.14576691, 0.53962094, 0.80354065, 0.606725...",2
4,4,646,Sess03_script02_User006M_011,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[53, 1029, 591, 410, 958, 18, 1242, 30, 921, 2...","[[0.0, 0.3838363710606158, 0.6329255345492237,...",2
...,...,...,...,...,...,...,...
149,149,6537,Sess22_script06_User043F_013,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[94, 643, 1, 187, 24, 16, 228, 24, 16, 48, 55,...","[[0.45135928928012914, 0.3414299277941974, 0.4...",2
150,150,6033,Sess21_script02_User041F_010,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[1, 5711, 164, 128, 3, 10881, 241, 2, 441, 128...","[[0.0, 0.4925207, 0.65072703, 0.3801505, 0.879...",2
151,151,10690,Sess35_script02_User069F_032,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[23, 1139, 2, 1171, 24, 24, 16, 9, 709, 3319, ...","[[0.006966984746885063, 0.5819747199762585, 0....",2
152,152,6823,Sess23_script06_User045F_003,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[232, 8, 6268, 37, 1174, 26, 412, 2936, 11384,...","[[0.0, 0.5936466183144044, 0.7385034431700166,...",2


In [153]:
from tqdm import tqdm
labels = yes2['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes2['Segment ID']]
NUM=8
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM) #한번에 3세트씩 8번 반복해라
    for fe in feature:
        x.append(np.array(fe).transpose())
        y.append(label)

154it [07:58,  3.11s/it]


In [154]:
seg = [i for i in yes2['Segment ID'] for s in range(3*NUM)]
text = [i for i in yes2['text_tokenize'] for s in range(3*NUM)]
bio = [i for i in yes2['temp+eda'] for s in range(3*NUM)]

In [155]:
len(x)

3696

In [156]:


yes2df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes2df = pd.DataFrame(yes2df)

yes2df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess03_script04_User005M_008,"[[0.0, 0.9623420288782376, 0.9798655732385638,...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
1,Sess03_script04_User005M_008,"[[0.0, 0.566370758745155, 0.7517337546794787, ...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
2,Sess03_script04_User005M_008,"[[0.0, 1.0, 0.9959468491958459, 0.990208679033...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3,Sess03_script04_User005M_008,"[[0.0, 0.938911066811685, 0.9605287370609593, ...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
4,Sess03_script04_User005M_008,"[[0.0, 0.7321576496843174, 0.8210471791588223,...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
...,...,...,...,...,...
3691,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8472170003843945, 0.891401194892...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3692,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8427829699385182, 0.888249465447...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3693,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.847416472603499, 0.8915429893159...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3694,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.841298580466981, 0.8871943493948...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2


In [157]:
print(len(yes2),len(yes2df))

154 3696


In [158]:
yes2df.drop(random.sample(range(3695),350), axis=0, inplace=True)
yes2df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess03_script04_User005M_008,"[[0.0, 0.9623420288782376, 0.9798655732385638,...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
1,Sess03_script04_User005M_008,"[[0.0, 0.566370758745155, 0.7517337546794787, ...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
2,Sess03_script04_User005M_008,"[[0.0, 1.0, 0.9959468491958459, 0.990208679033...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3,Sess03_script04_User005M_008,"[[0.0, 0.938911066811685, 0.9605287370609593, ...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
4,Sess03_script04_User005M_008,"[[0.0, 0.7321576496843174, 0.8210471791588223,...","[34, 72, 58, 270, 6, 18, 3079, 23, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
...,...,...,...,...,...
3691,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8472170003843945, 0.891401194892...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3692,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8427829699385182, 0.888249465447...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3693,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.847416472603499, 0.8915429893159...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
3694,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.841298580466981, 0.8871943493948...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2


In [166]:
dddd #dddd=dddd.drop(['index'],axis=1)

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess10_script03_User020M_032,"[[0.0, 1.0, 0.8744735104779164, 0.886617451014...","[259, 392, 22, 340, 3308, 129, 57, 463, 340, 7...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
1,Sess10_script03_User020M_032,"[[0.0, 1.0, 0.8586356206702327, 0.871486027462...","[259, 392, 22, 340, 3308, 129, 57, 463, 340, 7...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
2,Sess01_script05_User002M_034,"[[0.0, 0.9999999999999999, 0.8333884540213234,...","[84, 160, 49, 510, 79, 61, 7057, 1417, 505, 17...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
3,Sess01_script05_User002M_034,"[[0.0, 0.9999999999999999, 0.8155734903959928,...","[84, 160, 49, 510, 79, 61, 7057, 1417, 505, 17...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
4,Sess10_script01_User019F_007,"[[0.0, 0.9698028214197132, 0.9830630593254768,...","[15, 350, 15, 1745, 51, 272, 192, 1, 18, 8768,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
...,...,...,...,...,...
3495,Sess03_script03_User005M_033,"[[0.0, 0.5491286008737722, 0.6548360528751691,...","[1992, 12, 1557, 7423, 23, 850, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
3496,Sess27_script01_User053M_007,"[[0.2207304573159644, 0.5337093280651524, 0.35...","[88, 14, 105, 7, 14, 128, 107, 11969, 107, 119...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
3497,Sess25_script03_User050F_031,"[[0.0, 0.5463543612802085, 0.5563269858524766,...","[15, 276, 5, 45, 51, 11711, 0, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1
3498,Sess04_script03_User008F_027,"[[0.048905902886572816, 0.4750492055078576, 0....","[3, 626, 1, 33, 1459, 777, 352, 1459, 777, 352...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",1


In [171]:
dddd=pd.concat([yes2,dddd,yes2df])
dddd=dddd.drop(['index','level_0'],axis=1)
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)


# 3번 감정

In [172]:
from tqdm import tqdm
labels = yes3['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes3['Segment ID']]
NUM=8
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM)
    for fe in feature:
        x.append(np.array(fe).transpose())
        y.append(label)

142it [05:48,  2.45s/it]


In [177]:
len(seg)

3408

In [176]:
seg = [i for i in yes3['Segment ID'] for s in range(NUM*3)]
text = [i for i in yes3['text_tokenize'] for s in range(NUM*3)]
bio = [i for i in yes3['temp+eda'] for s in range(NUM*3)]

In [178]:
yes3df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes3df = pd.DataFrame(yes3df)

yes3df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess16_script03_User031M_043,"[[0.0, 0.9916096607767331, 0.997685481048375, ...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
1,Sess16_script03_User031M_043,"[[0.0, 0.9085915478151766, 0.9712338332854176,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
2,Sess16_script03_User031M_043,"[[0.0, 0.987044980739589, 1.0, 0.9911646651235...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3,Sess16_script03_User031M_043,"[[0.0, 0.8995780556614288, 0.9278067560683941,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
4,Sess16_script03_User031M_043,"[[0.0, 0.9999999999999999, 0.9239672454180238,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
...,...,...,...,...,...
3403,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.571774423122406, 0.6082214713096...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3404,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.6651260256767273, 0.693627655506...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3405,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.6503502130508423, 0.680109441280...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3406,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.5764442682266235, 0.612493813037...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3


In [179]:
print(len(yes3),len(yes3df))

142 3408


In [180]:
yes3df.drop(random.sample(range(3408),50), axis=0, inplace=True)
yes3df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
1,Sess16_script03_User031M_043,"[[0.0, 0.9085915478151766, 0.9712338332854176,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
2,Sess16_script03_User031M_043,"[[0.0, 0.987044980739589, 1.0, 0.9911646651235...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3,Sess16_script03_User031M_043,"[[0.0, 0.8995780556614288, 0.9278067560683941,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
4,Sess16_script03_User031M_043,"[[0.0, 0.9999999999999999, 0.9239672454180238,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
5,Sess16_script03_User031M_043,"[[0.0, 0.9565775754357857, 0.9012122301827206,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
...,...,...,...,...,...
3402,Sess29_script03_User058F_007,"[[0.0, 0.9999998807907104, 0.6106435060501099,...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3403,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.571774423122406, 0.6082214713096...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3405,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.6503502130508423, 0.680109441280...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3406,Sess29_script03_User058F_007,"[[0.0, 1.0, 0.5764442682266235, 0.612493813037...","[8, 487, 517, 22, 10, 3, 347, 91, 264, 362, 18...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3


In [182]:
dddd=pd.concat([yes3df,yes3,dddd]).reset_index()
dddd

Unnamed: 0,index,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,1,Sess16_script03_User031M_043,"[[0.0, 0.9085915478151766, 0.9712338332854176,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
1,2,Sess16_script03_User031M_043,"[[0.0, 0.987044980739589, 1.0, 0.9911646651235...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
2,3,Sess16_script03_User031M_043,"[[0.0, 0.8995780556614288, 0.9278067560683941,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
3,4,Sess16_script03_User031M_043,"[[0.0, 0.9999999999999999, 0.9239672454180238,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
4,5,Sess16_script03_User031M_043,"[[0.0, 0.9565775754357857, 0.9012122301827206,...","[232, 5146, 1817, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",3
...,...,...,...,...,...,...
10495,3691,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8472170003843945, 0.891401194892...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
10496,3692,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8427829699385182, 0.888249465447...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
10497,3693,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.847416472603499, 0.8915429893159...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
10498,3694,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.841298580466981, 0.8871943493948...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2


In [184]:
dddd=dddd.drop(['index'],axis=1)

In [185]:
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)

# 4번 감정
123개

In [186]:
from tqdm import tqdm
labels = yes4['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes4['Segment ID']]
NUM=10
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM)
    #print(feature)
    for fe in feature:
        x.append(np.array(fe).transpose())
        y.append(label)

123it [09:17,  4.53s/it]


In [187]:
seg = [i for i in yes4['Segment ID'] for s in range(NUM*3)]
text = [i for i in yes4['text_tokenize'] for s in range(NUM*3)]
bio = [i for i in yes4['temp+eda'] for s in range(NUM*3)]


yes4df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes4df = pd.DataFrame(yes4df)

yes4df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9622494284881353, 0.933926069427...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
1,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.840259422100348, 0.8495329740236...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
2,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9753352494425849, 0.960476401111...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9289894338000558, 0.915593821101...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
4,Sess14_script02_User027F_021,"[[0.0, 0.9369320305181452, 0.9603673383450002,...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
...,...,...,...,...,...
3685,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8375838092122732, 0.881628888421...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3686,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.837710494977476, 0.8817212150614...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3687,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8315404946687399, 0.877224433553...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3688,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8411692358342334, 0.884241990957...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4


In [188]:
yes4df.drop(random.sample(range(3690),313), axis=0, inplace=True)
yes4df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9622494284881353, 0.933926069427...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
1,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.840259422100348, 0.8495329740236...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
2,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9753352494425849, 0.960476401111...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3,Sess14_script02_User027F_021,"[[0.0, 1.0, 0.9289894338000558, 0.915593821101...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
5,Sess14_script02_User027F_021,"[[0.0, 0.9724042850703271, 1.0, 0.969571135436...","[241, 48, 9517, 4224, 53, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
...,...,...,...,...,...
3685,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8375838092122732, 0.881628888421...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3686,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.837710494977476, 0.8817212150614...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3687,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8315404946687399, 0.877224433553...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4
3688,Sess13_script05_User025F_033,"[[0.0, 1.0, 0.8411692358342334, 0.884241990957...","[44, 18, 30, 1918, 1918, 1453, 0, 0, 0, 0, 0, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",4


In [189]:
dddd=pd.concat([yes4df,yes4,dddd]).reset_index()
dddd=dddd.drop(['index'],axis=1)

In [191]:
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)

# 5번 감정
79개

In [192]:
from tqdm import tqdm
labels = yes5['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes5['Segment ID']]
NUM=15
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM)
    #print(feature)
    for fe in feature:
        x.append(np.array(fe).transpose())
        y.append(label)

79it [07:51,  5.97s/it]


In [193]:
seg = [i for i in yes5['Segment ID'] for s in range(NUM*3)]
text = [i for i in yes5['text_tokenize'] for s in range(NUM*3)]
bio = [i for i in yes5['temp+eda'] for s in range(NUM*3)]


yes5df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes5df = pd.DataFrame(yes5df)

yes5df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess04_script04_User007M_031,"[[0.0, 0.9720279137374699, 0.9940585600662198,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
1,Sess04_script04_User007M_031,"[[0.0, 0.9776601666003755, 0.9806709714977366,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
2,Sess04_script04_User007M_031,"[[0.0, 1.0, 0.8350003832443632, 0.992586174190...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3,Sess04_script04_User007M_031,"[[0.0, 0.9360795995462733, 0.9738883352859978,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
4,Sess04_script04_User007M_031,"[[0.0, 0.977750839399053, 0.9672768288315713, ...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
...,...,...,...,...,...
3550,Sess10_script02_User019F_007,"[[0.0, 0.9999999403953552, 0.858040988445282, ...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3551,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8484830856323242, 0.858624935150...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3552,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8621687293052673, 0.871394574642...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3553,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8606747984886169, 0.870000660419...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5


In [196]:
yes5df.drop(random.sample(range(3555),134), axis=0, inplace=True)
yes5df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess04_script04_User007M_031,"[[0.0, 0.9720279137374699, 0.9940585600662198,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
1,Sess04_script04_User007M_031,"[[0.0, 0.9776601666003755, 0.9806709714977366,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
2,Sess04_script04_User007M_031,"[[0.0, 1.0, 0.8350003832443632, 0.992586174190...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3,Sess04_script04_User007M_031,"[[0.0, 0.9360795995462733, 0.9738883352859978,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
4,Sess04_script04_User007M_031,"[[0.0, 0.977750839399053, 0.9672768288315713, ...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
...,...,...,...,...,...
3550,Sess10_script02_User019F_007,"[[0.0, 0.9999999403953552, 0.858040988445282, ...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3551,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8484830856323242, 0.858624935150...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3552,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8621687293052673, 0.871394574642...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3553,Sess10_script02_User019F_007,"[[0.0, 1.0, 0.8606747984886169, 0.870000660419...","[9, 384, 133, 149, 66, 3305, 27, 16, 230, 7, 2...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5


In [197]:
dddd=pd.concat([yes5df,yes5,dddd]).reset_index()
dddd=dddd.drop(['index'],axis=1)
dddd

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess04_script04_User007M_031,"[[0.0, 0.9720279137374699, 0.9940585600662198,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
1,Sess04_script04_User007M_031,"[[0.0, 0.9776601666003755, 0.9806709714977366,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
2,Sess04_script04_User007M_031,"[[0.0, 1.0, 0.8350003832443632, 0.992586174190...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
3,Sess04_script04_User007M_031,"[[0.0, 0.9360795995462733, 0.9738883352859978,...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
4,Sess04_script04_User007M_031,"[[0.0, 0.977750839399053, 0.9672768288315713, ...","[21, 5022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",5
...,...,...,...,...,...
17495,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8472170003843945, 0.891401194892...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
17496,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8427829699385182, 0.888249465447...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
17497,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.847416472603499, 0.8915429893159...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
17498,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.841298580466981, 0.8871943493948...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2


In [198]:
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)

# 6번 감정
35개

In [199]:
from tqdm import tqdm
labels = yes6['sentiment_x']
x, y = [], []
file_name = ['./waaav/'+f+'.wav' for f in yes6['Segment ID']]
NUM=33
for f, label in tqdm(zip(file_name, labels)):
    
    feature = get_features(f, NUM)
    #print(feature)
    for fe in feature:
        x.append(np.array(fe).transpose())
        y.append(label)

35it [08:58, 15.40s/it]


In [203]:
seg = [i for i in yes6['Segment ID'] for s in range(NUM*3)]
text = [i for i in yes6['text_tokenize'] for s in range(NUM*3)]
bio = [i for i in yes6['temp+eda'] for s in range(NUM*3)]


yes6df={'Segment ID': seg,'mfcc_scaled':x,'text_tokenize':text, 'temp+eda':bio,'sentiment_x':y}
yes6df = pd.DataFrame(yes6df)

yes6df

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess13_script06_User026F_019,"[[0.0, 0.9999999999999999, 0.9608148846764442,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
1,Sess13_script06_User026F_019,"[[0.0, 0.8893497060147615, 0.9251501458319147,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
2,Sess13_script06_User026F_019,"[[0.0, 0.9672434656487046, 0.9493295501081099,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
3,Sess13_script06_User026F_019,"[[0.0, 0.8324701301043231, 0.8910219154763063,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
4,Sess13_script06_User026F_019,"[[0.0, 0.8692361742952204, 0.8943141198396264,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
...,...,...,...,...,...
3460,Sess13_script06_User026F_018,"[[0.0, 1.0, 0.8403886809532284, 0.849320074636...","[987, 747, 21, 3392, 2441, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
3461,Sess13_script06_User026F_018,"[[0.0, 1.0, 0.8366358549832241, 0.845777246092...","[987, 747, 21, 3392, 2441, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
3462,Sess13_script06_User026F_018,"[[0.0, 1.0, 0.8352059675074612, 0.844427371961...","[987, 747, 21, 3392, 2441, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
3463,Sess13_script06_User026F_018,"[[0.0, 1.0, 0.8479802288012779, 0.856486803155...","[987, 747, 21, 3392, 2441, 0, 0, 0, 0, 0, 0, 0...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6


In [204]:
dddd=pd.concat([yes6df,yes6,dddd]).reset_index()
dddd=dddd.drop(['index'],axis=1)
dddd

Unnamed: 0,Segment ID,mfcc_scaled,text_tokenize,temp+eda,sentiment_x
0,Sess13_script06_User026F_019,"[[0.0, 0.9999999999999999, 0.9608148846764442,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
1,Sess13_script06_User026F_019,"[[0.0, 0.8893497060147615, 0.9251501458319147,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
2,Sess13_script06_User026F_019,"[[0.0, 0.9672434656487046, 0.9493295501081099,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
3,Sess13_script06_User026F_019,"[[0.0, 0.8324701301043231, 0.8910219154763063,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
4,Sess13_script06_User026F_019,"[[0.0, 0.8692361742952204, 0.8943141198396264,...","[331, 29, 9462, 54, 4, 132, 70, 49, 3392, 9463...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",6
...,...,...,...,...,...
20995,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8472170003843945, 0.891401194892...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
20996,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.8427829699385182, 0.888249465447...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
20997,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.847416472603499, 0.8915429893159...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2
20998,Sess16_script06_User031M_022,"[[0.0, 1.0, 0.841298580466981, 0.8871943493948...","[15, 25, 3396, 1124, 9, 10005, 23, 8, 129, 8, ...","[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....",2


In [205]:
with open('yes.pkl', 'wb') as f:
    pickle.dump(dddd, f)

In [207]:
yes0=tmp[tmp['sentiment_x']==0]

In [211]:
yes0=yes0.reset_index()

In [212]:
yes0.drop(random.sample(range(7393),3893), axis=0, inplace=True)
yes0

Unnamed: 0,index,Segment ID,temp+eda,text_tokenize,mfcc_scaled,sentiment_x
1,81,Sess01_script02_User002M_021,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[75, 17, 43, 27, 209, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.39622842525417723, 1.0, 0.7046509808430651...",0
2,4524,Sess15_script02_User029M_030,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[105, 11, 157, 341, 90, 532, 2, 20, 2, 827, 17...","[[0.004803048548005395, 0.4873190168037505, 0....",0
5,3675,Sess13_script01_User025F_002,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[8, 5, 2, 196, 850, 3, 520, 186, 6, 0, 0, 0, 0...","[[0.5196198731227133, 0.6942348173289606, 0.68...",0
9,1608,Sess06_script01_User011M_035,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[55, 1050, 108, 46, 45, 20, 2, 177, 30, 4, 6, ...","[[0.0, 0.42532619085649165, 0.755734512129753,...",0
10,42,Sess01_script01_User002M_030,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 1234, 12, 287, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.0, 0.44962945315611874, 0.3926550680216022...",0
...,...,...,...,...,...,...
7379,10762,Sess35_script04_User070F_008,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[42, 262, 1, 2859, 326, 32, 2859, 100, 344, 11...","[[0.032945521331930516, 0.40808203802457316, 0...",0
7381,9329,Sess31_script06_User061M_026,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[1819, 224, 248, 12, 1140, 0, 0, 0, 0, 0, 0, 0...","[[0.0, 0.489686164522752, 0.8910410850529435, ...",0
7383,10297,Sess34_script02_User068M_020,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[534, 3934, 3934, 46, 1377, 4, 1618, 0, 0, 0, ...","[[0.0012534706618411162, 0.4571524437803839, 0...",0
7387,8121,Sess27_script06_User054M_001,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 87, 722, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.09887480369794877, 0.5645720231907226, 0.7...",0


In [215]:
yes0=yes0.drop(['index'],axis=1)
yes0

Unnamed: 0,Segment ID,temp+eda,text_tokenize,mfcc_scaled,sentiment_x
1,Sess01_script02_User002M_021,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[75, 17, 43, 27, 209, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[0.39622842525417723, 1.0, 0.7046509808430651...",0
2,Sess15_script02_User029M_030,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[105, 11, 157, 341, 90, 532, 2, 20, 2, 827, 17...","[[0.004803048548005395, 0.4873190168037505, 0....",0
5,Sess13_script01_User025F_002,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[8, 5, 2, 196, 850, 3, 520, 186, 6, 0, 0, 0, 0...","[[0.5196198731227133, 0.6942348173289606, 0.68...",0
9,Sess06_script01_User011M_035,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[55, 1050, 108, 46, 45, 20, 2, 177, 30, 4, 6, ...","[[0.0, 0.42532619085649165, 0.755734512129753,...",0
10,Sess01_script01_User002M_030,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 1234, 12, 287, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.0, 0.44962945315611874, 0.3926550680216022...",0
...,...,...,...,...,...
7379,Sess35_script04_User070F_008,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[42, 262, 1, 2859, 326, 32, 2859, 100, 344, 11...","[[0.032945521331930516, 0.40808203802457316, 0...",0
7381,Sess31_script06_User061M_026,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[1819, 224, 248, 12, 1140, 0, 0, 0, 0, 0, 0, 0...","[[0.0, 0.489686164522752, 0.8910410850529435, ...",0
7383,Sess34_script02_User068M_020,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[534, 3934, 3934, 46, 1377, 4, 1618, 0, 0, 0, ...","[[0.0012534706618411162, 0.4571524437803839, 0...",0
7387,Sess27_script06_User054M_001,"[35.07, 35.07, 35.07, 35.07, 35.07, 35.07, 35....","[15, 87, 722, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.09887480369794877, 0.5645720231907226, 0.7...",0


In [218]:
abc=pd.concat([dddd,yes0]).reset_index()
with open('audio_augmented_per3500.pkl', 'wb') as f:
    pickle.dump(abc, f)