In [None]:
import pandas as pd
import numpy as np
import torch
from llm2vec import LLM2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout, concatenate
from keras.utils import to_categorical
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from keras.optimizers import Adam

# Download the NLTK resources used by the preprocessing step.
# 'punkt_tab' is required by word_tokenize in recent NLTK releases (3.8.2+/3.9),
# 'punkt' is kept for older releases, and 'stopwords' provides the English
# stop-word list.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Location of the MoodyLyrics4Q CSV file.
dataset_path = "D:/Project/yxz1225/data/data_moody/MoodyLyrics4Q.csv"

# Read the CSV, shuffle every row with a fixed seed so the order is
# reproducible, and renumber the index from 0.
data = (
    pd.read_csv(dataset_path)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
data.head()

In [None]:
# Text preprocessing
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Tokenise ``text`` and return its cleaned, lower-cased form.

    Tokens are produced by ``word_tokenize``. A token is kept only when it is
    purely alphabetic and its lower-cased form is not an English stop word.
    The surviving tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (for example ``None`` or a NaN standing in
        for a missing entry) and blank strings are treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing or blank.
    """
    # Guard first: word_tokenize only accepts strings, and there is nothing to
    # tokenise in a blank string.
    if not isinstance(text, str) or not text.strip():
        return ''

    # The stop-word set is cached so it is not rebuilt for every row.
    stop_words = _english_stop_words()

    cleaned = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            cleaned.append(lowered)
    return ' '.join(cleaned)

# Clean every lyric and store the result in a new column, then preview the raw
# and cleaned text side by side.
data['processed_lyrics'] = data['lyrics'].map(preprocess_text)
data.loc[:, ['lyrics', 'processed_lyrics']].head()

In [None]:
# Encode the mood labels: map each mood to an integer id, then one-hot encode
# the ids for use with a softmax / categorical cross-entropy classifier.
label_encoder = LabelEncoder()
label_encoder.fit(data['mood'])
encoded_labels = label_encoder.transform(data['mood'])
labels = to_categorical(encoded_labels)

# Show the class names in the order of their integer ids.
label_encoder.classes_

In [None]:
# Compute LLM2Vec embeddings for the cleaned lyrics.
device = "cuda" if torch.cuda.is_available() else "cpu"

l2v = LLM2Vec.from_pretrained(
    "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    peft_model_name_or_path="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-unsup-simcse",
    device_map=device,
    torch_dtype=torch.bfloat16,
)

# Number of texts encoded per forward pass; lower it if the GPU runs out of memory.
EMBED_BATCH_SIZE = 8


def _to_float32_array(embeddings):
    """Convert encoder output (torch tensor or array-like) to a float32 NumPy array."""
    if torch.is_tensor(embeddings):
        # NumPy has no bfloat16 dtype and cannot read GPU memory directly.
        embeddings = embeddings.detach().to(torch.float32).cpu().numpy()
    return np.asarray(embeddings, dtype=np.float32)


def get_llm2vec_embedding(text):
    """Return the LLM2Vec embedding of a single text as a 1-D float32 array.

    The text is wrapped in a one-element list because ``encode`` works on a
    collection of sentences; a bare string would be iterated item by item
    (character by character) rather than encoded as one sentence.
    """
    return _to_float32_array(l2v.encode([text]))[0]


# Encode all lyrics in one batched call instead of one call per row.
lyrics_texts = data['processed_lyrics'].tolist()
X_llm2vec = _to_float32_array(l2v.encode(lyrics_texts, batch_size=EMBED_BATCH_SIZE))  # shape: (num_samples, embedding_dim)

# Each sample must map to exactly one embedding row, otherwise the features
# would no longer line up with the labels.
assert X_llm2vec.ndim == 2 and X_llm2vec.shape[0] == len(data), (
    f"expected {len(data)} embedding rows, got array of shape {X_llm2vec.shape}"
)

# Keep the per-row column for any later cell that reads it.
data['llm2vec_embedding'] = list(X_llm2vec)
embedding_dim = X_llm2vec.shape[1]  # LLM2Vec embedding dimensionality

In [None]:
# Hold out 20% of the samples for validation. The split is seeded for
# reproducibility and stratified on the labels.
x_train, x_val, y_train, y_val = train_test_split(
    X_llm2vec,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Report the shape of each feature matrix.
for caption, features in (("训练数据形状", x_train), ("测试数据形状", x_val)):
    print(f"{caption}: {features.shape}")

In [None]:
# Build the classifier: a small fully connected network on top of the
# LLM2Vec embedding (two Dense + Dropout stages, then a softmax output).
num_classes = labels.shape[1]  # width of the one-hot label matrix

input_layer = Input(shape=(embedding_dim,), name='LLM2Vec_Input')

# Hidden stage 1
dense1 = Dense(units=128, activation='relu', name='Dense1')(input_layer)
dropout1 = Dropout(rate=0.2, name='Dropout1')(dense1)

# Hidden stage 2
dense2 = Dense(units=64, activation='relu', name='Dense2')(dropout1)
dropout2 = Dropout(rate=0.2, name='Dropout2')(dense2)

# One probability per mood class
output_layer = Dense(units=num_classes, activation='softmax', name='Output')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the model, evaluating on the validation split after every epoch.
fit_options = dict(
    batch_size=16,
    epochs=20,
    validation_data=(x_val, y_val),
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Evaluate on the validation split.
y_pred = model.predict(x_val)

# Turn predicted probabilities and one-hot targets back into class ids.
y_pred_classes = y_pred.argmax(axis=1)
y_true = y_val.argmax(axis=1)

# Per-class precision / recall / F1, followed by the weighted F1 score.
report = classification_report(
    y_true,
    y_pred_classes,
    target_names=label_encoder.classes_,
)
print(report)

weighted_f1 = f1_score(y_true, y_pred_classes, average="weighted")
print(f'F1 Score: {weighted_f1:.2f}')