In [1]:
import os
from glob import glob
import shutil
import librosa
import librosa.display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

# 전처리

In [2]:
for audio in glob('./wav/*/*/*.wav'):
    new_audio = audio.split('\\')[-1]
    shutil.move(audio, f'./audio/{new_audio}')

In [3]:
train_df = pd.DataFrame(columns=['session', 'type_num', 'gender_num'])
valid_df = pd.DataFrame(columns=['session', 'type_num', 'gender_num'])
test_df = pd.DataFrame(columns=['session', 'type_num', 'gender_num'])

for audio in os.listdir('./audio'):
    session, type, gender_num  = audio.split('_')
    gender_num = gender_num.split('.')[0]
    if session in ['Sess19','Sess20']:
        test_df = train_df.append({'session':session, 'type_num':type, 'gender_num':gender_num}, ignore_index=True)
    elif session in ['Sess17', 'Sess18']:
        valid_df = valid_df.append({'session':session, 'type_num':type, 'gender_num':gender_num}, ignore_index=True)
    else:
        train_df = train_df.append({'session':session, 'type_num':type, 'gender_num':gender_num}, ignore_index=True)

In [7]:
train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)
test_df.to_csv('test.csv', index=False)

# 여기부터 연습코드

In [5]:
audio, sr = librosa.load('./audio/Sess01_impro01_F001.wav', sr=16000)
audio.shape

(49962,)

In [5]:
audio, sr = librosa.load('./audio/Sess01_impro01_F003.wav', sr=16000)
audio.shape

(58800,)

In [6]:
audio

array([ 0.006073  ,  0.00592041,  0.0017395 , ..., -0.02334595,
       -0.02182007, -0.01806641], dtype=float32)

In [14]:
torch.from_numpy(audio).unsqueeze(0)

tensor([[-0.0007, -0.0010,  0.0006,  ..., -0.0060, -0.0040,  0.0000]])

In [7]:
torch.from_numpy(audio)

tensor([ 0.0061,  0.0059,  0.0017,  ..., -0.0233, -0.0218, -0.0181])

In [1]:
from dataset import EmotionDataset
from torch.utils.data import DataLoader

data = EmotionDataset(split='valid', num=1)
loader = DataLoader(data, batch_size=3)

In [2]:
data.__getitem__(0)

{'audio': array([-0.00189209, -0.0010376 , -0.00164795, ...,  0.        ,
         0.        ,  0.        ], dtype=float32),
 'speaker': 6,
 'listener': 5}

In [12]:
df = pd.read_csv('./test.csv')

In [3]:
for a in loader:
    print(a)
    break

{'audio': tensor([[-0.0019, -0.0010, -0.0016,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0007, -0.0008, -0.0006,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0013, -0.0009, -0.0007,  ...,  0.0000,  0.0000,  0.0000]]), 'speaker': tensor([6, 4, 4]), 'listener': tensor([5, 5, 2])}


In [16]:
r = np.random.RandomState(42)

In [22]:
begin = r.randint(10)
begin

9

In [6]:
annot = pd.read_csv('./C.csv', encoding='cp949')

In [12]:
annot[(annot['Segment ID']=='Sess01_script01_M001') & (annot['role']=='speaker')]['Emotion.1'].values

array(['surprise'], dtype=object)

In [4]:
label_dict = {'disgust':0, 'angry':1, 'sad':2, 'fear':3, 
'surprise':4,'neutral':5, 'happy':6}
label_dict

{'disgust': 0,
 'angry': 1,
 'sad': 2,
 'fear': 3,
 'surprise': 4,
 'neutral': 5,
 'happy': 6}

In [10]:
import torch
import torch.nn as nn
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_batch_first=True)
src = torch.rand(3, 512)
out = encoder_layer(src)

In [11]:
out.shape

torch.Size([3, 512])

In [12]:
with open('./textdata_segID.txt', 'r', encoding='cpc') as f:
    f.readlines()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [89]:
import pickle
with open('./textdata_segID.txt', 'rb') as lf:
    text_ID = pickle.load(lf)

with open('./embedding_textdata.txt', 'rb') as lf:
    embedding = pickle.load(lf)

In [90]:
len(text_ID)

10284

In [91]:
len(embedding)

10284

In [92]:
text_emb_dic = {}
for id, emb in zip(text_ID, embedding):
    text_emb_dic[id] = emb

In [93]:
import numpy as np

In [94]:
np.save('text_emb_dict.npy', text_emb_dic)

In [11]:
text_emb_dic = np.load('text_emb_dic.npy', allow_pickle=True).item()
text_emb_dic['Sess17_impro01_F001']

KeyError: 'Sess17_impro01_F001'

In [12]:
len(text_emb_dic)

8265

In [5]:
from dataset import EmotionDataset
from torch.utils.data import DataLoader

In [22]:
train_data = EmotionDataset('test', 1)
train_loader = DataLoader(train_data, batch_size=3)

In [23]:
for pack in train_loader:
    break

In [24]:
audio = pack['audio']
text = pack['text']
emo_s = pack['speaker']
emo_l = pack['listener']

In [25]:
print(audio)
print(text)
print(emo_s)
print(emo_l)


tensor([[ 0.0012,  0.0010, -0.0007,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0007, -0.0006,  0.0016,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0061,  0.0059,  0.0017,  ...,  0.0000,  0.0000,  0.0000]])
tensor([[   2, 3206, 6844, 3945, 5595, 6150, 1258, 5793, 6060, 6393,  517,   54,
         3220, 7389, 7767, 2584, 7123, 3105, 7328, 2049, 5761, 1185, 6043, 1189,
          517, 5330, 5859, 1765,  862, 3098,  633,    3,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1],
        [   2, 1198, 5771,  517, 7226, 5330, 6015, 2298, 5357, 6016, 5400,  517,
           46, 3220, 3502, 1934, 7318, 2298, 7126, 7100,  517, 6005, 1407, 7101,
         6553, 4249, 7871, 5377, 6844,  633,    3,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,

In [38]:
audio.shape

torch.Size([3, 160000])

In [27]:
text.shape

torch.Size([3, 64])

In [57]:
import torch
lstm = nn.LSTM(64, 128, 1, batch_first=True)
lstm(text.type(torch.float))

(tensor([[ 7.6159e-01, -7.6159e-01, -0.0000e+00,  0.0000e+00,  7.6159e-01,
          -0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -7.6159e-01,
           0.0000e+00, -7.6159e-01,  7.6159e-01, -7.6159e-01, -7.6159e-01,
          -0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  7.6159e-01,
           0.0000e+00,  0.0000e+00,  0.0000e+00, -0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  7.6159e-01,
           7.5194e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00, -0.0000e+00, -7.6159e-01,
           0.0000e+00,  0.0000e+00,  7.6159e-01,  0.0000e+00, -7.6159e-01,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00, -7.6159e-01,
           0.0000e+00,  0.0000e+00, -0.0000e+00,  0.0000e+00, -0.0000e+00,
           7.6159e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00, -0

In [46]:
from constants import *
import torchaudio
import torch.nn as nn

spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=SAMPLE_RATE,
            n_fft=N_FFT,
            hop_length=HOP_SIZE,
            f_min=F_MIN,
            f_max=F_MAX,
            n_mels=N_MELS,
            normalized=False)

to_db = torchaudio.transforms.AmplitudeToDB()
spec_bn = nn.BatchNorm2d(1)

In [47]:
audio_emb = spec(audio)
audio_emb.shape

torch.Size([3, 229, 313])

In [48]:
audio_emb = to_db(audio_emb)
audio_emb.shape

torch.Size([3, 229, 313])

In [45]:
audio_emb = spec_bn(audio_emb)
audio_emb.shape

RuntimeError: running_mean should contain 229 elements not 1

In [50]:
class BasicConv1d(nn.Module):
    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0):
        super(BasicConv1d, self).__init__()
        self.conv = nn.Conv1d(in_planes, out_planes,
                              kernel_size=kernel_size, stride=stride,
                              padding=padding, bias=False)
        self.bn = nn.BatchNorm1d(out_planes)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(2)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.maxpool(x)
        return x

audio_emb_layer = nn.Sequential(
            BasicConv1d(1, 64, (1, 3)),
            BasicConv1d(64, 100, (1, 5)),
            BasicConv1d(100, 100, (1, 7))
            )

In [51]:
audio_emb = audio_emb_layer(audio)
audio_emb.shape

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 1, 1, 3], but got 3-dimensional input of size [1, 3, 160000] instead

In [52]:
audio.shape

torch.Size([3, 160000])

In [58]:
import pandas as pd
annot = pd.read_csv('./C.csv', encoding='cp949')

In [63]:
data = annot[(annot['Segment ID']=='Sess01_script01_M001') & (annot['role']=='speaker')]

In [72]:
emo_list = []
for i in range(10):
    emo = data['Emotion.'+ str(i+1)].values[0]
    emo_list.append(label_dict[emo])
    

    # print(data['Emotion.'+ str(i+1)].values)
    # break

In [79]:
emo_list

[4, 4, 4, 4, 3, 4, 4, 4, 4, 4]

In [71]:
label_dict = {'disgust':0, 'angry':1, 'sad':2, 'fear':3, 'surprise':4,'neutral':5, 'happy':6}

In [87]:
soft_label = np.zeros(7)
for i in range(7):
    soft_label[i] = emo_list.count(i)/10

In [88]:
soft_label

array([0. , 0. , 0. , 0.1, 0.9, 0. , 0. ])

# 3. 여기부터 성래가

## 3.1 Speaker, Listener 분리하기

In [42]:
import pandas as pd

df = pd.read_csv('./data/KEMDy19/annotation.csv')
del df['Unnamed: 0'], df['Unnamed: 0.1'] , df['Unnamed: 0.1.1']

df_speaker = df[df['role']=='speaker']
df_listener = df[df['role']=='listener']

print(len(df_speaker)) # 10279
print(len(df_listener)) # 10287
print(len(df),len(df_listener)+len(df_speaker)) # 같이야한다

10279
10287
20566 20566


In [43]:
df_listener = df_listener.drop(df_listener[df_listener['Segment ID']=='Sess04_impro03_F031'].index)
df_listener = df_listener.drop(df_listener[df_listener['Segment ID']=='Sess04_impro03_M031'].index)
df_listener[df_listener['Segment ID']=='Sess04_impro03_F031']

Unnamed: 0,start,end,Segment ID,Emotion,Valence,Arousal,Emotion.1,Valence.1,Arousal.1,Emotion.2,...,Emotion.10,Valence.10,Arousal.10,speaker,text,role,Session,script,sequence,dialogue


In [44]:
df_speaker.to_csv('./data/KEMDy19/df_speaker.csv',index=False,encoding='utf-8')
df_listener.to_csv('./data/KEMDy19/df_listener.csv',index=False,encoding='utf-8')

# df_speaker

In [39]:
df_speaker = pd.read_csv('./data/KEMDy19/df_speaker.csv')

df_speaker[df_speaker['Segment ID']=='Sess01_script01_F002']

Unnamed: 0,start,end,Segment ID,Emotion,Valence,Arousal,Emotion.1,Valence.1,Arousal.1,Emotion.2,...,Emotion.10,Valence.10,Arousal.10,speaker,text,role,Session,script,sequence,dialogue
4,28.943,32.167,Sess01_script01_F002,fear,1.6,3.7,fear,3,4,fear,...,fear,2,3,F,지섭씨. 일단 112에 신고하자.,speaker,Sess01,script01,F002,Sess01_script01


In [111]:
emotion = torch.zeros((7))
emotion[2] += 1
emotion[2] += 1
emotion = emotion/10
emotion = torch.FloatTensor(emotion)
type(emotion)

torch.Tensor

In [73]:
import pandas as pd

temp = pd.DataFrame(columns=['seg'])
temp = temp.append({'seg' : ['1','2']},ignore_index=True)
temp['seg'][0]

['1', '2']

## 3.2 ws=8일때 데이터 리스트 뽑아 저장하기

In [67]:
def _get_ws_list(df,ws):
        ws_df = pd.DataFrame(columns=['seq'])
        ws_list = ['padding' for _ in range(0,ws-1)]
        seed_id = df['Segment ID'][1]
        for id in df['Segment ID']:
            if seed_id.split('_')[0:2] == id.split('_')[0:2]: #같은 dialog라면
                ws_list.append(id)
                seed_id = id
            else:
                # ws 만큼 잘라서 ws_df에 append
                for i in range(0,len(ws_list)-ws+1):
                    ws_df = ws_df.append({'seq' : ws_list[i:i+ws]},ignore_index=True)
                ws_list = ['padding' for _ in range(0,ws-1)]
                seed_id = id
                ws_list.append(id)
        else: # 마지막 다이얼로그의 위의 else문에 들어갈 수 없으므로 따로 처리
            for i in range(0,len(ws_list)-ws+1):
                    ws_df = ws_df.append({'seq' : ws_list[i:i+ws]},ignore_index=True)
        return ws_df

In [106]:
ws_df = _get_ws_list(df_speaker,8)
ws_df.to_csv('./data/KEMDy19/df_speaker_8.csv',index=False,encoding='utf-8')

In [79]:
import random
audio, sr = librosa.load('./audio/Sess01_impro01_F001.wav', sr=16000)
audio_len = audio.shape[0]
if audio_len < sr*10: # 10초가 안되는 음성이면
    audio = librosa.util.fix_length(audio, size=sr*10)
else: # 10초가 넘는 음성이면
    start = random.randint(audio_len - sr*10)
    end = start + sr*10
    audio = audio[start:end]


In [119]:
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms
from constants import F_MAX, F_MIN, HOP_SIZE, N_FFT, N_MELS, SAMPLE_RATE
from torchsummary import summary


spec = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE,
                                                        n_fft=N_FFT,
                                                        hop_length=HOP_SIZE,
                                                        f_min=F_MIN,
                                                        f_max=F_MAX,
                                                        n_mels=N_MELS,
                                                        normalized=False)

In [18]:
import numpy as np

text_emb_dic = np.load('./data/KEMDy19/embedding_768.npy', allow_pickle=True).item()


In [25]:
a = [2,4]
m = [1,2,3,4,5]

# [item for item in x if item not in y]
[session for session in m if session not in a]


[1, 3, 5]

In [38]:
import argparse
import ast

test_split = str([1,2,3,4])
l = ast.literal_eval(test_split)
l[0]

1

In [65]:
import torch

y = torch.FloatTensor([1.0,0,0,1,0,1,0])
pred = torch.rand((7))
print(pred)
torch.flip(torch.argsort(pred,dim=-1),dims=(-1,))[0:3].tolist()

tensor([0.4100, 0.1979, 0.7392, 0.2015, 0.9846, 0.2317, 0.6167])


[4, 2, 6]

In [69]:
y = torch.Tensor([0,3,5])
pred = torch.Tensor([2,5,3])


pred[0]

tensor(2.)

In [76]:
a = torch.ones([7,7])
a = torch.cat((a,torch.zeros(1,7)),dim=0)
a[:,0]

tensor([1., 1., 1., 1., 1., 1., 1., 0.])