In [10]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# 텍스트 전처리 함수
def clean_text(text):
    text = text.lower()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

# MoodyLyrics 데이터 로드
data_path = '../data/ml_balanced.xlsx'
df = pd.read_excel(data_path)

# 텍스트와 레이블 추출 및 전처리
texts = (df['Artist'].astype(str) + ' ' + df['Title'].astype(str)).apply(clean_text).tolist()
labels = df['Mood'].tolist()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
import random
from nltk.corpus import wordnet

def synonym_replacement(sentence, n):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def augment_text(texts, labels, num_augmented):
    augmented_texts = []
    augmented_labels = []

    for _ in range(num_augmented):
        for text, label in zip(texts, labels):
            augmented_texts.append(synonym_replacement(text, n=2))
            augmented_labels.append(label)
    
    return augmented_texts, augmented_labels

# 텍스트 증강
augmented_texts, augmented_labels = augment_text(texts, labels, num_augmented=5)

# 증강된 데이터 추가
texts.extend(augmented_texts)
labels.extend(augmented_labels)

In [12]:
# 토크나이저 정의 및 텍스트 토큰화
num_words = 20000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# 패딩 처리
maxlen = 200
X = pad_sequences(sequences, maxlen=maxlen)

In [13]:
# 레이블 인코딩
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
y = to_categorical(y)

# 클래스 수 정의
num_classes = y.shape[1]

In [14]:
# GloVe 임베딩 로드
embeddings_index = {}
glove_path = '../data/glove.6B.100d.txt'  # GloVe 파일 경로
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# 임베딩 매트릭스 생성
embedding_dim = 100
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [15]:
# 모델 정의
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Conv1D(filters=256, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# 모델 컴파일
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

model.summary()



In [16]:
# 학습용 데이터와 검증용 데이터로 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 조기 종료 콜백 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 모델 학습
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_val, y_val), callbacks=[early_stopping])


Epoch 1/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 288ms/step - accuracy: 0.2377 - loss: 1.4013 - val_accuracy: 0.2362 - val_loss: 1.3878
Epoch 2/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 274ms/step - accuracy: 0.2638 - loss: 1.3887 - val_accuracy: 0.2404 - val_loss: 1.3869
Epoch 3/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 273ms/step - accuracy: 0.2678 - loss: 1.3854 - val_accuracy: 0.2996 - val_loss: 1.3723
Epoch 4/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 273ms/step - accuracy: 0.2903 - loss: 1.3806 - val_accuracy: 0.3533 - val_loss: 1.3451
Epoch 5/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 281ms/step - accuracy: 0.3109 - loss: 1.3637 - val_accuracy: 0.3512 - val_loss: 1.3284
Epoch 6/100
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 279ms/step - accuracy: 0.3525 - loss: 1.3174 - val_accuracy: 0.3804 - val_loss: 1.2946
Epoc

In [17]:
# 모델 평가
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.7990 - loss: 0.6481
Validation Loss: 0.6506103277206421
Validation Accuracy: 0.7983333468437195


In [19]:
import os

# 모델 저장 경로
save_dir = os.path.join(os.getcwd(), 'models')
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

model_path = os.path.join(save_dir, 'text_classification_model.keras')

# 모델 저장
model.save(model_path)
print(f'Model saved at: {model_path}')

Model saved at: c:\music_recommender\scripts\models\text_classification_model.keras
