In [1]:
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
# from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

import np_utils
from keras.utils import to_categorical
import pandas as pd
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed TensorFlow version so the environment used for the
# run is recorded next to the results.
import tensorflow as tf
tf.__version__

'2.10.1'

In [3]:
# Project locations, all derived from the directory the kernel was started in.
# Back-slashes are turned into forward slashes and every "c:" substring is
# upper-cased to "C:" so the resulting strings look the same on Windows.
current_path = os.getcwd().replace("\\", "/")
current_path = current_path.replace("c:", "C:")
data_path = f"{current_path}/data/"    # input CSV files are read from here
model_path = f"{current_path}/model/"  # trained artifacts are written here

In [4]:
# Load the prepared table: numeric feature columns plus the
# wav_id / final_label / sentence columns used further down.
final_df = pd.read_csv(f"{data_path}final_df.csv")

In [5]:
# Preview the first rows to check the column layout after loading.
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,479,480,481,482,483,484,485,wav_id,final_label,sentence
0,0.142637,0.688733,0.665087,0.551895,0.554083,0.577122,0.589633,0.547716,0.479773,0.467628,...,0.000592,0.0005972413,0.0004031004,0.000356,0.000352,0.000134,1.10898e-05,5f6892fff8fac448cc0a5b44,sadness,비싼 건 못 샀어. 알바비를 모아서 산 거거든.
1,0.120067,0.571521,0.557885,0.552126,0.541684,0.513932,0.540703,0.624245,0.616377,0.64137,...,1e-06,5.569921e-07,6.353915e-07,5e-06,9e-06,7e-06,8.042485e-07,5f0604b9b140144dfcff01f7,sadness,내가 하고있는 일에 진도가 잘 나가지지 않아서 일하고 싶은 마음이 다 사라졌어. 무...
2,0.060619,0.462079,0.468565,0.486078,0.534205,0.458072,0.390353,0.400268,0.544558,0.692785,...,5e-06,4.966622e-06,4.113252e-06,1.1e-05,1.1e-05,6e-06,4.186585e-07,5f0f24fcb140144dfcff44c6,sadness,좋은 생각이네 강남역 보다는 그 근처에 있는 한가한 공원으로 바꾸는게 나을 것 같아.\t
3,0.056708,0.652244,0.713408,0.730681,0.730744,0.700714,0.686664,0.632844,0.579585,0.620094,...,3e-06,8.719081e-06,2.982148e-05,8e-05,3.4e-05,1.1e-05,4.901741e-07,5f0122f3704f492ee12565b8,sadness,지난번에도 미뤘던 약속이야. 더 이상 미루기에는 너무 미안해.
4,0.05613,0.466003,0.363453,0.37938,0.392485,0.354566,0.3482,0.404133,0.50008,0.672009,...,3e-06,3.572638e-06,4.523758e-06,7e-06,1.1e-05,8e-06,5.580449e-07,5f0c741bb140144dfcff2f19,sadness,오늘이 발표날인데 나한테 연락이 없더라고. 그래서 알아봤더니 내 이름이 없대.


In [6]:
class text_embedding():
    """Build a dense feature matrix with sentence embeddings appended.

    The transformer keeps the numeric feature columns of the input frame,
    encodes the ``sentence`` column with a pre-trained SentenceTransformer
    and concatenates both blocks column-wise.

    Parameters
    ----------
    model_name : str
        Name (or path) handed to ``SentenceTransformer``.
    drop_columns : iterable of str, optional
        Columns that are not model inputs and are removed before the
        concatenation. Columns that are absent from the frame are ignored,
        so unlabeled frames (no ``final_label``) can be transformed too.
    """

    # Identifier, label and raw-text columns. 'Unnamed: 0' appears to be the
    # row index that was written into the CSV when the frame was saved; left
    # in, it would be fed to the network as a feature that encodes row order.
    # NOTE(review): artifacts fitted while that column was still included have
    # one extra input feature and must be re-created.
    DEFAULT_DROP_COLUMNS = ('Unnamed: 0', 'final_label', 'wav_id', 'sentence')

    def __init__(self, model_name, drop_columns=DEFAULT_DROP_COLUMNS):
        self.model_name = model_name
        self.drop_columns = tuple(drop_columns)
        # The embedding model is loaded lazily and reused across transform()
        # calls instead of being re-instantiated every time.
        self._embedding_model = None
        self._loaded_model_name = None

    def fit(self, X, y=None):
        """Nothing is learned here; returns ``self`` unchanged."""
        return self

    def _get_embedding_model(self):
        """Return the cached SentenceTransformer, (re)loading it if needed."""
        cached = getattr(self, '_embedding_model', None)
        if cached is None or getattr(self, '_loaded_model_name', None) != self.model_name:
            cached = SentenceTransformer(self.model_name)
            self._embedding_model = cached
            self._loaded_model_name = self.model_name
        return cached

    def transform(self, X):
        """Return ``[numeric features | sentence embedding]`` as a 2-D array.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``sentence`` column; every column not listed in
            ``drop_columns`` is treated as a numeric feature.

        Returns
        -------
        numpy.ndarray
            One row per input row: the remaining feature columns followed by
            the embedding vector of that row's sentence.
        """
        # Pass a plain list so the encoder sees the sentences in row order,
        # independent of the frame's index labels.
        sentences = X['sentence'].tolist()
        embedding_vec = np.asarray(self._get_embedding_model().encode(sentences))

        feature_frame = X.drop(columns=list(self.drop_columns), errors='ignore')
        X_val = np.concatenate((feature_frame.to_numpy(), embedding_vec), axis=1)
        return X_val
        
def custom_model(x_train, num_classes=6):
    """Build and compile the 1-D CNN classifier.

    The network is four Conv1D + MaxPooling1D stages (256, 256, 128 and 64
    filters; the third stage is followed by 20% dropout), then Flatten, a
    32-unit ReLU dense layer with 30% dropout and a softmax output layer.

    Parameters
    ----------
    x_train : array-like
        Only ``x_train.shape[1]`` is read; it sets the input length of the
        first convolution, whose input shape is ``(x_train.shape[1], 1)``.
    num_classes : int, optional
        Number of units in the softmax output layer. Defaults to 6, the
        value that was previously hard-coded.

    Returns
    -------
    A compiled ``Sequential`` model (Adam optimizer, categorical
    cross-entropy loss, accuracy metric).

    Raises
    ------
    ValueError
        If ``num_classes`` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError("num_classes must be a positive integer")

    # (number of filters, dropout rate after pooling or None) per conv stage.
    conv_stages = ((256, None), (256, None), (128, 0.2), (64, None))

    model = Sequential()
    for stage_index, (filters, dropout_rate) in enumerate(conv_stages):
        conv_kwargs = dict(kernel_size=5, strides=1, padding='same', activation='relu')
        if stage_index == 0:
            # Only the first layer declares the input shape: one channel per step.
            conv_kwargs['input_shape'] = (x_train.shape[1], 1)
        model.add(Conv1D(filters, **conv_kwargs))
        model.add(MaxPooling1D(pool_size=5, strides=2, padding='same'))
        if dropout_rate is not None:
            model.add(Dropout(dropout_rate))

    model.add(Flatten())
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(units=num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [9]:
# Retrain only the selected model (text + audio features).
model_name = "jhgan/ko-sbert-multitask"

# Learning-rate control: watch the training loss and multiply the learning
# rate by 0.4 after 2 epochs without improvement, never going below 1e-7.
rlrp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.4,
    patience=2,
    min_lr=0.0000001,
    verbose=0,
)

scaler = StandardScaler()
encoder = OneHotEncoder()

# One-hot encode the labels; the encoder wants a single-column 2-D input.
Y = final_df['final_label'].values.reshape(-1, 1)
Y = encoder.fit_transform(Y).toarray()

# Feature matrix: numeric columns with the sentence embedding appended.
txt_embed = text_embedding(model_name=model_name)
X = txt_embed.transform(final_df)

# Split first, then fit the scaler on the training part only and reuse its
# statistics for the test part.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, shuffle=True
)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [10]:
# Accuracy noted for each pre-trained sentence-embedding model that was tried:
# model_name='jhgan/ko-sbert-multitask'
# accuracy: 0.6979054808616638

# model_name="snunlp/KR-SBERT-V40K-klueNLI-augSTS"
# accuracy: 0.6732008457183838

In [11]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling start from the same state on every run of this cell.
tf.keras.utils.set_random_seed(0)

# custom_model declares its input as (n_features, 1), so the 2-D matrices need
# a trailing channel axis. Guarding on ndim keeps the cell safe to re-run:
# an unconditional expand_dims would add another axis each time.
if x_train.ndim == 2:
    x_train = np.expand_dims(x_train, axis=2)
if x_test.ndim == 2:
    x_test = np.expand_dims(x_test, axis=2)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

model = custom_model(x_train)
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[rlrp],
)

test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print("Pre-trained Model: ", model_name)
print("Test Accuracy: ",test_acc)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Pre-trained Model:  jhgan/ko-sbert-multitask
Test Accuracy:  0.6930719614028931


In [12]:
# Store the trained network in HDF5 format inside the model directory.
model.save(f"{model_path}my_model.h5")

In [13]:
# Keep the name of the sentence-embedding model next to the saved network.
with open(f"{model_path}text_model_name.txt", 'w') as name_file:
    name_file.write(model_name)

In [14]:
import joblib

# Persist the fitted label encoder and feature scaler alongside the model.
joblib.dump(encoder, f"{model_path}encoder.pkl")
joblib.dump(scaler, f"{model_path}scaler.pkl")

['C:/Users/82105/Desktop/코드/korean_emotions/model/scaler.pkl']