In [164]:
import os
import sys
import re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from nltk import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer,LancasterStemmer,WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk_data.corpora import wordnet
from imblearn.over_sampling import SMOTE

ImportError: Could not find 'cudart64_90.dll'. TensorFlow requires that this DLL be installed in a directory that is named in your %PATH% environment variable. Download and install CUDA 9.0 from this URL: https://developer.nvidia.com/cuda-toolkit

# IEMOCAP Dataset - Emotion extraction

In [86]:
file_path = 'C:\python\\ten1.9\\google_stt\\dataset\\data\\Session1\\dialog\\EmoEvaluation\\Ses01F_impro01.txt'

In [87]:
useful_regex = re.compile(r'\[.+\]\n', re.IGNORECASE) #특수문자에서 시작하여 \n까지 검사

with open(file_path) as f:
    file_content = f.read()
    
info_lines = re.findall(useful_regex, file_content)

info_lines

['[START_TIME - END_TIME] TURN_NAME EMOTION [V, A, D]\n',
 '[6.2901 - 8.2357]\tSes01F_impro01_F000\tneu\t[2.5000, 2.5000, 2.5000]\n',
 '[10.0100 - 11.3925]\tSes01F_impro01_F001\tneu\t[2.5000, 2.5000, 2.5000]\n',
 '[14.8872 - 18.0175]\tSes01F_impro01_F002\tneu\t[2.5000, 2.5000, 2.5000]\n',
 '[19.2900 - 20.7875]\tSes01F_impro01_F003\txxx\t[2.5000, 3.0000, 3.0000]\n',
 '[21.3257 - 24.7400]\tSes01F_impro01_F004\txxx\t[2.5000, 3.0000, 2.5000]\n',
 '[27.4600 - 31.4900]\tSes01F_impro01_F005\tneu\t[2.5000, 3.5000, 2.0000]\n',
 '[38.9650 - 43.5900]\tSes01F_impro01_F006\tfru\t[2.0000, 3.5000, 3.5000]\n',
 '[46.5800 - 52.1900]\tSes01F_impro01_F007\tfru\t[2.5000, 3.5000, 3.5000]\n',
 '[56.1600 - 58.8225]\tSes01F_impro01_F008\tfru\t[2.0000, 3.5000, 3.5000]\n',
 '[61.8700 - 65.9700]\tSes01F_impro01_F009\tfru\t[2.0000, 3.5000, 3.0000]\n',
 '[66.4200 - 69.3400]\tSes01F_impro01_F010\txxx\t[1.5000, 3.5000, 3.5000]\n',
 '[72.4500 - 82.2600]\tSes01F_impro01_F011\tfru\t[2.0000, 3.5000, 3.5000]\n',
 '[85.27

In [88]:
for l in info_lines[1:10]:
    print(l.strip().split('\t'))

['[6.2901 - 8.2357]', 'Ses01F_impro01_F000', 'neu', '[2.5000, 2.5000, 2.5000]']
['[10.0100 - 11.3925]', 'Ses01F_impro01_F001', 'neu', '[2.5000, 2.5000, 2.5000]']
['[14.8872 - 18.0175]', 'Ses01F_impro01_F002', 'neu', '[2.5000, 2.5000, 2.5000]']
['[19.2900 - 20.7875]', 'Ses01F_impro01_F003', 'xxx', '[2.5000, 3.0000, 3.0000]']
['[21.3257 - 24.7400]', 'Ses01F_impro01_F004', 'xxx', '[2.5000, 3.0000, 2.5000]']
['[27.4600 - 31.4900]', 'Ses01F_impro01_F005', 'neu', '[2.5000, 3.5000, 2.0000]']
['[38.9650 - 43.5900]', 'Ses01F_impro01_F006', 'fru', '[2.0000, 3.5000, 3.5000]']
['[46.5800 - 52.1900]', 'Ses01F_impro01_F007', 'fru', '[2.5000, 3.5000, 3.5000]']
['[56.1600 - 58.8225]', 'Ses01F_impro01_F008', 'fru', '[2.0000, 3.5000, 3.5000]']


## 1. Text data and emotion extraction

### 1.1 Text data and emotion extraction

In [89]:
info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

start_times, end_times,wav_file_names,emotions,vals,acts,doms = [],[],[],[],[],[],[]

for sess in [5]:
    emo_dataset_dir = 'C:\python\\ten1.9\\google_stt\\dataset\\data\\Session{}\\dialog\\EmoEvaluation\\'.format(sess)
    evaluation_files = [l for l in os.listdir(emo_dataset_dir) if 'Ses' in l]
    for file in evaluation_files:
        with open(emo_dataset_dir + file) as f:
            content = f.read()
        info_lines = re.findall(info_line, content)
        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')
            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)
            start_time, end_time = float(start_time), float(end_time)
            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)
            

In [90]:
df_emotion = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])
df_emotion['start_time'] = start_times
df_emotion['end_time'] = end_times
df_emotion['wav_file'] = wav_file_names
df_emotion['emotion'] = emotions
df_emotion['val'] = vals
df_emotion['act'] = acts
df_emotion['dom'] = doms

df_emotion.head()

Unnamed: 0,start_time,end_time,wav_file,emotion,val,act,dom
0,3.6132,6.17,Ses05F_impro01_F000,neu,4.0,2.5,3.0
1,14.15,19.49,Ses05F_impro01_F001,fru,2.5,3.0,3.0
2,22.85,26.9,Ses05F_impro01_F002,fru,2.5,3.0,3.5
3,29.98,34.46,Ses05F_impro01_F003,fru,2.5,3.0,3.0
4,39.48,42.18,Ses05F_impro01_F004,fru,2.5,3.5,3.0


In [136]:
df_emotion.to_csv('./dataset/preprocess/emotion_data.csv', index=False)

### 1.2 Text data and emotion extraction

In [94]:
textdata_path = 'C:\python\\ten1.9\\google_stt\\dataset\\data\Session1\dialog\\transcriptions\\Ses01F_impro01.txt' #예시 텍스트 데이터 경로

useful_regex = re.compile(r'.*[.+\S]\n', re.IGNORECASE) # 정규식 생성 (모든문자에 대해서 반복하되 \n까지 검출)

with open(textdata_path) as f:
    file_content = f.read()
    
info_line = re.findall(useful_regex, file_content) #위의 정규식에 적합한 데이터 찾기

file_content

"Ses01F_impro01_F000 [006.2901-008.2357]: Excuse me.\nSes01F_impro01_M000 [007.5712-010.4750]: Do you have your forms?\nSes01F_impro01_F001 [010.0100-011.3925]: Yeah.\nSes01F_impro01_M001 [010.9266-014.6649]: Let me see them.\nSes01F_impro01_F002 [014.8872-018.0175]: Is there a problem?\nSes01F_impro01_M002 [016.8352-019.7175]: Who told you to get in this line?\nSes01F_impro01_F003 [019.2900-020.7875]: You did.\nSes01F_impro01_F004 [021.3257-024.7400]: You were standing at the beginning and you directed me.\nSes01F_impro01_M003 [023.4700-028.0300]: Okay. But I didn't tell you to get in this line if you are filling out this particular form.\nSes01F_impro01_F005 [027.4600-031.4900]: Well what's the problem?  Let me change it.\nSes01F_impro01_M004 [028.3950-031.2117]: This form is a Z.X.four.\nSes01F_impro01_M005 [031.2660-039.3875]: You can't--  This is not the line for Z.X.four.  If you're going to fill out the Z.X.four, you need to have a different form of ID.\nSes01F_impro01_F006 [038

In [101]:

for l in info_line[0:10]:
    start_end_time = l.split(':')[0].strip( )
    start_end_time = start_end_time.split(' ')[1].strip( )
    a,b = start_end_time[1:-1].split('-')
    print(a,b)


006.2901 008.2357
007.5712 010.4750
010.0100 011.3925
010.9266 014.6649
014.8872 018.0175
016.8352 019.7175
019.2900 020.7875
021.3257 024.7400
023.4700 028.0300
027.4600 031.4900


In [127]:
text_line = re.compile(r'.*[.+\S]', re.IGNORECASE)

file_name_line = re.compile(r'\A\S*', re.I)
start_end_line = re.compile(r'\[.+]', re.I)

wav_file_names, start_times, end_times, texts = [],[],[],[]

for sess in [5]:
    text_dataset_dir = 'C:\python\\ten1.9\\google_stt\\dataset\\data\Session{}\dialog\\transcriptions\\'.format(sess)#텍스트 데이터셋 디렉토리
    text_files = [l for l in os.listdir(text_dataset_dir) if 'Ses' in l]#텍스트 파일 리스트 생성
    
    for file in text_files:
        with open(text_dataset_dir + file) as f:
            content = f.read()
        
        text_lines = re.findall(text_line, content)
        
        for line in text_lines[0:]: 
            start_end_time = line.split(':')[0].strip( )
            start_end_time = start_end_time.split(' ')[1].strip( )
            start_time, end_time= start_end_time[1:-1].split('-')
            
            wav_file_name = re.search(file_name_line,line)
            wav_file_name = wav_file_name.group()
            
            text = line.split(':')[1].strip( )
            
            start_times.append(start_time)
            end_times.append(end_time)
            texts.append(text)
            wav_file_names.append(wav_file_name)
            

In [128]:
df_text = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'text'])
df_text['start_time'] = start_times
df_text['end_time'] = end_times
df_text['wav_file'] = wav_file_names
df_text['text'] = texts


df_text.head()


Unnamed: 0,start_time,end_time,wav_file,text
0,3.6132,6.17,Ses05F_impro01_F000,"Hi, I need an ID."
1,5.78,14.78,Ses05F_impro01_M000,"ahh Yeah, this is the wrong line. I'm sorry. ..."
2,13.0388,13.8112,Ses05F_impro01_FXX0,No.
3,14.15,19.49,Ses05F_impro01_F001,"Okay, I'm sorry, but I just stood in this line..."
4,17.64,24.16,Ses05F_impro01_M001,"I mean, there's really nothing I can do for yo..."


In [134]:
df_text.to_csv('./dataset/preprocess/text_data.csv', index=False)

### 1.3 data fusion

In [135]:
text_data = pd.read_csv('./dataset/preprocess/text_data.csv')
emotion_data = pd.read_csv('./dataset/preprocess/emotion_data.csv')

text_data.head()

Unnamed: 0,start_time,end_time,wav_file,text
0,3.6132,6.17,Ses05F_impro01_F000,"Hi, I need an ID."
1,5.78,14.78,Ses05F_impro01_M000,"ahh Yeah, this is the wrong line. I'm sorry. ..."
2,13.0388,13.8112,Ses05F_impro01_FXX0,No.
3,14.15,19.49,Ses05F_impro01_F001,"Okay, I'm sorry, but I just stood in this line..."
4,17.64,24.16,Ses05F_impro01_M001,"I mean, there's really nothing I can do for yo..."


In [137]:
emotion_data.head()

Unnamed: 0,start_time,end_time,wav_file,emotion,val,act,dom
0,3.6132,6.17,Ses05F_impro01_F000,neu,4.0,2.5,3.0
1,14.15,19.49,Ses05F_impro01_F001,fru,2.5,3.0,3.0
2,22.85,26.9,Ses05F_impro01_F002,fru,2.5,3.0,3.5
3,29.98,34.46,Ses05F_impro01_F003,fru,2.5,3.0,3.0
4,39.48,42.18,Ses05F_impro01_F004,fru,2.5,3.5,3.0


In [141]:
fusion_data = pd.merge(emotion_data,text_data, how="inner" ,on=["start_time","end_time","wav_file"])

In [143]:
fusion_data

Unnamed: 0,start_time,end_time,wav_file,emotion,val,act,dom,text
0,3.6132,6.17,Ses05F_impro01_F000,neu,4.0,2.5,3.0,"Hi, I need an ID."
1,14.1500,19.49,Ses05F_impro01_F001,fru,2.5,3.0,3.0,"Okay, I'm sorry, but I just stood in this line..."
2,22.8500,26.90,Ses05F_impro01_F002,fru,2.5,3.0,3.5,"No, they told me-I'm sorry, but they told me t..."
3,29.9800,34.46,Ses05F_impro01_F003,fru,2.5,3.0,3.0,"What, I mean what... what's the difference? W..."
4,39.4800,42.18,Ses05F_impro01_F004,fru,2.5,3.5,3.0,Can you just-can I just get the right-
...,...,...,...,...,...,...,...,...
2153,236.5700,244.83,Ses05M_script03_2_M041,ang,1.0,4.5,5.0,"You are a vile tempered, wicked living, evil l..."
2154,244.8400,246.58,Ses05M_script03_2_M042,ang,1.0,4.5,4.5,"Oh, you're not going like this."
2155,246.5900,248.83,Ses05M_script03_2_M043,ang,1.5,4.0,4.5,"[GARBAGE] No, you're not."
2156,255.8600,260.33,Ses05M_script03_2_M044,ang,1.0,5.0,5.0,oh! Marry you again? I wouldn't marry you agai...


In [145]:
fusion_data.to_csv('./dataset/preprocess/fusion_data.csv', index=False)

## 2. Text data preprocessing

### 2.1 Data input

In [148]:
fusion_data = fusion_data.reset_index()
fusion_data = fusion_data.drop(['index'],axis=1)

In [151]:
fusion_data

Unnamed: 0,start_time,end_time,wav_file,emotion,val,act,dom,text
0,3.6132,6.17,Ses05F_impro01_F000,neu,4.0,2.5,3.0,"Hi, I need an ID."
1,14.1500,19.49,Ses05F_impro01_F001,fru,2.5,3.0,3.0,"Okay, I'm sorry, but I just stood in this line..."
2,22.8500,26.90,Ses05F_impro01_F002,fru,2.5,3.0,3.5,"No, they told me-I'm sorry, but they told me t..."
3,29.9800,34.46,Ses05F_impro01_F003,fru,2.5,3.0,3.0,"What, I mean what... what's the difference? W..."
4,39.4800,42.18,Ses05F_impro01_F004,fru,2.5,3.5,3.0,Can you just-can I just get the right-
...,...,...,...,...,...,...,...,...
2153,236.5700,244.83,Ses05M_script03_2_M041,ang,1.0,4.5,5.0,"You are a vile tempered, wicked living, evil l..."
2154,244.8400,246.58,Ses05M_script03_2_M042,ang,1.0,4.5,4.5,"Oh, you're not going like this."
2155,246.5900,248.83,Ses05M_script03_2_M043,ang,1.5,4.0,4.5,"[GARBAGE] No, you're not."
2156,255.8600,260.33,Ses05M_script03_2_M044,ang,1.0,5.0,5.0,oh! Marry you again? I wouldn't marry you agai...


### 2.2 Stopword Apply

In [152]:
#english(Stopwords file)
english_file = open('./dataset/english',mode='r')
english_data = english_file.readlines()
english_text = []

for i in english_data:
    english_text.append(i[:-1])

english_text

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [158]:
# Stopwords Removement
def clean_text(text):
    #기호문자 제거
    talk_text = re.sub("[^a-zA-Z]", " ", text)
    # 소문자로 변경 후 분리
    word_tokens = talk_text.lower().split()
    #표제어 추출을 하기 위한 함수
    le = WordNetLemmatizer()
    #english_text 안의 영어 불용어를 집합으로 변화
    stop_words = set(english_text)
    word_tokens = [le.lemmatize(w) for w in word_tokens if not w in stop_words]
    # 토큰화된 문장을 하나의 문장으로 변화
    cleaned_text = " ".join(word_tokens)
    
    # \W:비문자 \b:단어 경계(\w와 \W의 경계) \w:문자
    # compile 정규표현식을 컴파일 하는 함수
    shortword = re.compile(r'\W*\b\w{1,2}\b')
    cleaned_text = shortword.sub('', cleaned_text)
    
    return cleaned_text

# Stopwords Apply Content 열에 clean_review 함수를 통해 apply
fusion_data['text'] = fusion_data['text'].apply(clean_text)
fusion_data.head

<bound method NDFrame.head of       start_time  end_time                wav_file emotion  val  act  dom  \
0         3.6132      6.17     Ses05F_impro01_F000     neu  4.0  2.5  3.0   
1        14.1500     19.49     Ses05F_impro01_F001     fru  2.5  3.0  3.0   
2        22.8500     26.90     Ses05F_impro01_F002     fru  2.5  3.0  3.5   
3        29.9800     34.46     Ses05F_impro01_F003     fru  2.5  3.0  3.0   
4        39.4800     42.18     Ses05F_impro01_F004     fru  2.5  3.5  3.0   
...          ...       ...                     ...     ...  ...  ...  ...   
2153    236.5700    244.83  Ses05M_script03_2_M041     ang  1.0  4.5  5.0   
2154    244.8400    246.58  Ses05M_script03_2_M042     ang  1.0  4.5  4.5   
2155    246.5900    248.83  Ses05M_script03_2_M043     ang  1.5  4.0  4.5   
2156    255.8600    260.33  Ses05M_script03_2_M044     ang  1.0  5.0  5.0   
2157    260.3400    266.35  Ses05M_script03_2_M045     ang  1.0  5.0  5.0   

                                             

In [159]:
tokenizer = RegexpTokenizer(r'\w+')
# raw_data의 'Content'열의 타입을 str 형태로 변경
fusion_data['text'] = fusion_data['text'].astype('str')
# raw_data의 'Content'열의 각 문장들을 단어로 tokenize
fusion_data['text'] = fusion_data['text'].apply(tokenizer.tokenize)

In [161]:
content_list = np.array(fusion_data['text'].tolist())
print(content_list)

[list(['need']) list(['okay', 'sorry', 'stood', 'line', 'hour', 'way'])
 list(['told', 'sorry', 'told', 'line']) ... list(['garbage'])
 list(['marry', 'marry', 'came', 'crawling', 'bended', 'knee'])
 list(['wicked', 'little', 'vampire', 'pray', 'god', 'never', 'set', 'eye', 'long', 'live'])]


### 2.3 Data Tokenizing & word_indexing

In [162]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(content_list)

text_Xdata = np.array(tokenizer.texts_to_sequences(content_list))

print(text_Xdata)

NameError: name 'Tokenizer' is not defined