In [1]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


def _pad_or_truncate_frames(embedding, max_frames):
    """Zero-pad or cut a (frames, features) array so it has exactly ``max_frames`` rows."""
    n_frames = embedding.shape[0]
    if n_frames < max_frames:
        return np.pad(embedding, ((0, max_frames - n_frames), (0, 0)), mode='constant')
    return embedding[:max_frames, :]


def _stack_and_scale(items, empty_shape):
    """Stack ``items`` into one array divided by its largest absolute value.

    An empty list yields an empty array of shape ``empty_shape`` instead of
    raising, and an all-zero stack is returned unscaled instead of producing NaNs.
    """
    if not items:
        return np.empty(empty_shape)
    stacked = np.array(items)
    peak = np.max(np.abs(stacked))
    if peak == 0:
        return stacked / 1.0
    return stacked / peak


# Load audio embeddings, assigning a special label to genres that have no lyrics counterpart
def load_embeddings_and_labels_with_special_genre(embeddings_dir, lyrics_labels, max_frames=156):
    """Load per-genre audio embeddings stored as ``.npy`` files.

    ``embeddings_dir`` is expected to contain one sub-directory per genre. Every
    ``.npy`` file inside is loaded as a pickled dict and its ``"embeddings"`` entry
    is padded/truncated to ``max_frames`` rows. Files whose name contains
    ``"_augmented"`` are collected separately from the rest.

    Args:
        embeddings_dir: Root directory with one sub-directory per genre.
        lyrics_labels: Iterable of genre names known on the lyrics side; audio
            genres not in it are labelled ``"outros"``.
        max_frames: Number of rows every embedding is forced to.

    Returns:
        ``(embeddings, labels, augmented_embeddings, augmented_labels)`` where the
        embeddings are arrays of shape ``(n, max_frames, features)``, each scaled
        independently by its own maximum absolute value, and the labels are lists
        of strings. A group with no files is returned as an empty array of shape
        ``(0, max_frames, features)`` rather than raising.
    """
    known_genres = set(lyrics_labels)  # O(1) membership instead of scanning the list per file

    embeddings, labels, augmented_embeddings, augmented_labels = [], [], [], []

    # Sorted so the sample order does not depend on the file system's listing order.
    for genre in sorted(os.listdir(embeddings_dir)):
        genre_path = os.path.join(embeddings_dir, genre)
        if not os.path.isdir(genre_path):
            continue

        # Audio genres with no lyrics counterpart share one special label.
        label = genre if genre in known_genres else "outros"

        for file_name in sorted(os.listdir(genre_path)):
            if not file_name.endswith(".npy"):
                continue

            # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
            data = np.load(os.path.join(genre_path, file_name), allow_pickle=True).item()
            embedding = _pad_or_truncate_frames(data["embeddings"], max_frames)

            if "_augmented" in file_name:
                augmented_embeddings.append(embedding)
                augmented_labels.append(label)
            else:
                embeddings.append(embedding)
                labels.append(label)

    # Give an empty group the same trailing shape as the populated one so the
    # caller can still concatenate the two arrays along axis 0.
    feature_dim = next((item.shape[1] for item in embeddings + augmented_embeddings), 0)
    empty_shape = (0, max_frames, feature_dim)

    embeddings = _stack_and_scale(embeddings, empty_shape)
    augmented_embeddings = _stack_and_scale(augmented_embeddings, empty_shape)

    return embeddings, labels, augmented_embeddings, augmented_labels


def load_lyrics_embeddings(embeddings_dir, metadata_csv_path, max_frames=156):
    """Load lyrics embeddings and their genre labels.

    Every ``*_embedding.npy`` file in ``embeddings_dir`` is matched against the
    ``filename`` column of the metadata CSV. Files without a matching row are
    reported and skipped entirely, so the returned embeddings and labels always
    have the same length and stay index-aligned.

    Args:
        embeddings_dir: Directory containing the ``*_embedding.npy`` files.
        metadata_csv_path: CSV with at least ``filename`` and ``genre`` columns.
        max_frames: Number of rows every embedding is padded/truncated to.

    Returns:
        ``(embeddings, labels)``: an array of shape ``(n, max_frames, features)``
        scaled by its maximum absolute value, and a list of genre strings
        (lower-cased, spaces removed).
    """
    # Load the CSV with the lyrics metadata
    metadata_df = pd.read_csv(metadata_csv_path)

    # Map file name (with '_embedding.npy' replaced by '.npy') -> normalised genre
    file_to_genre = {
        filename.replace('_embedding.npy', '.npy'): genre.replace(" ", "").lower()
        for filename, genre in zip(metadata_df['filename'], metadata_df['genre'])
    }

    embeddings, labels = [], []

    for file_name in os.listdir(embeddings_dir):
        if not file_name.endswith("_embedding.npy"):
            continue

        # Resolve the label first: an embedding without a label must not be kept,
        # otherwise embeddings and labels drift out of alignment.
        file_base_name = file_name.replace('_embedding.npy', '.npy')
        genre = file_to_genre.get(file_base_name)
        if genre is None:
            print(f"Gênero não encontrado para {file_name}")
            continue

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only use on trusted files.
        embedding = np.load(os.path.join(embeddings_dir, file_name), allow_pickle=True)

        # Fix rank and length: a 1-D vector becomes a single frame
        if embedding.ndim == 1:
            embedding = np.expand_dims(embedding, axis=0)
        if embedding.shape[0] < max_frames:
            embedding = np.pad(embedding, ((0, max_frames - embedding.shape[0]), (0, 0)), mode='constant')
        else:
            embedding = embedding[:max_frames, :]

        embeddings.append(embedding)
        labels.append(genre)

    if not embeddings:
        # Nothing matched: return an empty, correctly-ranked array instead of failing in np.max.
        return np.empty((0, max_frames, 0)), labels

    embeddings = np.array(embeddings)
    peak = np.max(np.abs(embeddings))
    if peak > 0:
        # Out-of-place division also works for integer-typed stacks.
        embeddings = embeddings / peak

    return embeddings, labels

# Directory and file locations
# NOTE(review): absolute, machine-specific paths; consider deriving them from a configurable base directory.
audio_embeddings_dir = '/home/jmayos/songtest/songresult_gtzan'
lyrics_embeddings_dir = '/home/jmayos/songtest/embeddings'
metadata_csv_path = '/home/jmayos/songtest/embeddings/metadata.csv'  # Replace with the correct path


lyrics_embeddings, lyrics_labels = load_lyrics_embeddings(lyrics_embeddings_dir, metadata_csv_path)
audio_embeddings, audio_labels, audio_augmented_embeddings, audio_augmented_labels = load_embeddings_and_labels_with_special_genre(audio_embeddings_dir, lyrics_labels)

# Combine original and augmented audio data
audio_embeddings = np.concatenate([audio_embeddings, audio_augmented_embeddings], axis=0)
audio_labels += audio_augmented_labels

# Collapse every genre outside the supported set into the catch-all label
valid_genres = {'pop', 'disco', 'hiphop', 'metal', 'rock', 'reggae'}
audio_genre_names = [label if label in valid_genres else 'outros' for label in audio_labels]

# Warn when the audio and lyrics label sets differ
if set(audio_genre_names) != set(lyrics_labels):
    print("Aviso: Os gêneros de áudio e texto não correspondem. Ajustando para incluir gêneros especiais.")

# Encode labels: strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
audio_label_ids = label_encoder.fit_transform(audio_genre_names)
audio_labels_mapped = to_categorical(audio_label_ids)

# Split into train and test sets.
# NOTE(review): audio and lyrics rows are paired purely by position and must have the
# same length, otherwise train_test_split raises ValueError — confirm the two
# loaders really yield corresponding samples in the same order.
X_train_audio, X_test_audio, X_train_lyrics, X_test_lyrics, y_train, y_test = train_test_split(
    audio_embeddings, lyrics_embeddings, audio_labels_mapped, test_size=0.2, random_state=42
)

# Class weights, keyed by the actual class id. The split is not stratified, so a
# class can be absent from y_train; keying by position in the unique-classes
# array would then assign weights to the wrong classes.
y_train_ids = np.argmax(y_train, axis=1)
train_classes = np.unique(y_train_ids)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_ids)
class_weight_dict = {int(class_id): float(weight) for class_id, weight in zip(train_classes, class_weights)}



# Build the multimodal model
def create_multimodal_cnn(input_shape_lyrics, input_shape_audio, num_classes):
    """Build a two-branch 1-D CNN that fuses lyrics and audio embeddings.

    Each modality goes through the same stack (Conv1D -> BatchNormalization ->
    MaxPooling1D -> Dropout -> Flatten); the flattened features are concatenated
    and classified by a dense head with a softmax over ``num_classes``.

    Args:
        input_shape_lyrics: Shape of one lyrics sample (without the batch axis).
        input_shape_audio: Shape of one audio sample (without the batch axis).
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras ``Model`` taking ``[lyrics_input, audio_input]``.
    """
    def conv_branch(sample_shape, input_name):
        # One modality-specific feature extractor; returns (input tensor, flat features).
        branch_input = Input(shape=sample_shape, name=input_name)
        features = Conv1D(64, 3, activation='relu')(branch_input)
        features = BatchNormalization()(features)
        features = MaxPooling1D(pool_size=2)(features)
        features = Dropout(0.3)(features)
        features = Flatten()(features)
        return branch_input, features

    # Lyrics branch is built first, then audio, matching the model's input order.
    lyrics_input, lyrics_features = conv_branch(input_shape_lyrics, "lyrics_input")
    audio_input, audio_features = conv_branch(input_shape_audio, "audio_input")

    # Fusion head
    fused = concatenate([lyrics_features, audio_features])
    hidden = Dense(128, activation='relu')(fused)
    hidden = Dropout(0.4)(hidden)
    probabilities = Dense(num_classes, activation='softmax')(hidden)

    return Model(inputs=[lyrics_input, audio_input], outputs=probabilities)


input_shape_lyrics = (X_train_lyrics.shape[1], X_train_lyrics.shape[2])
input_shape_audio = (X_train_audio.shape[1], X_train_audio.shape[2])
num_classes = y_train.shape[1]

multimodal_model = create_multimodal_cnn(input_shape_lyrics, input_shape_audio, num_classes)
multimodal_model.compile(optimizer=Adam(0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data for early stopping, so the
# metrics reported below are optimistic; a separate validation split would avoid that.
history = multimodal_model.fit(
    [X_train_lyrics, X_train_audio], y_train,
    validation_data=([X_test_lyrics, X_test_audio], y_test),
    epochs=50, batch_size=16, class_weight=class_weight_dict,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
)

# Evaluate the model
loss, accuracy = multimodal_model.evaluate([X_test_lyrics, X_test_audio], y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Classification report and confusion matrix
y_pred = np.argmax(multimodal_model.predict([X_test_lyrics, X_test_audio]), axis=1)
y_true = np.argmax(y_test, axis=1)

# Pass the full list of class ids explicitly: if a class is missing from the test
# split, classification_report would otherwise reject target_names for having a
# different length, and the confusion matrix would not line up with the tick labels.
class_ids = np.arange(len(label_encoder.classes_))

print(classification_report(
    y_true, y_pred, labels=class_ids, target_names=label_encoder.classes_, zero_division=0
))

conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

2024-12-01 23:45:16.794049: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 23:45:16.843409: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 23:45:17.151820: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 23:45:17.157057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


: 

In [1]:
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Raise the sample count to 200/200 (used below as the target number of rows
# for both the audio and the lyrics embeddings).
target_samples = 200

# Augment embeddings with additive Gaussian noise
def augment_embeddings(embeddings, n_augments=3, noise_level=0.01):
    """Create noisy copies of every embedding.

    Args:
        embeddings: Array-like whose first axis indexes samples.
        n_augments: Number of noisy copies per sample; values <= 0 produce none.
        noise_level: Standard deviation of the zero-mean Gaussian noise.

    Returns:
        Array of shape ``(len(embeddings) * n_augments, *sample_shape)``. The copies
        of one sample are adjacent (sample 0's copies, then sample 1's, ...). When
        no copies are produced the result still has shape ``(0, *sample_shape)``,
        so it can be stacked with the originals.
    """
    base = np.asarray(embeddings)
    n_copies = max(int(n_augments), 0)

    # Repeating along axis 0 keeps each sample's copies together and preserves the
    # trailing dimensions even when zero rows are produced.
    repeated = np.repeat(base, n_copies, axis=0)
    return repeated + np.random.normal(0, noise_level, repeated.shape)

# Number of noisy copies per original needed to reach the target.
# Ceiling division (floor would undershoot target_samples and could leave audio and
# lyrics with different lengths), clamped at 0 when there are already enough samples.
n_augments_audio = max(-(-target_samples // len(audio_embeddings)) - 1, 0)
n_augments_lyrics = max(-(-target_samples // len(lyrics_embeddings)) - 1, 0)

# Augment the audio and lyrics embeddings
augmented_audio = augment_embeddings(audio_embeddings, n_augments=n_augments_audio)
augmented_lyrics = augment_embeddings(lyrics_embeddings, n_augments=n_augments_lyrics)

# Combine originals with the augmented copies. Stacking is skipped when no copies
# were requested, because an empty augmentation result may not share the
# originals' trailing shape.
audio_embeddings_extended = (
    np.vstack([audio_embeddings, augmented_audio]) if n_augments_audio > 0 else audio_embeddings
)
lyrics_embeddings_extended = (
    np.vstack([lyrics_embeddings, augmented_lyrics]) if n_augments_lyrics > 0 else lyrics_embeddings
)

# Extend the labels. augment_embeddings emits the copies grouped per original
# (all copies of sample 0, then sample 1, ...), so each label row is repeated in
# place; tiling the whole label block would misalign labels when n_augments > 1.
num_original = len(audio_labels_mapped)
audio_labels_extended = np.concatenate(
    [audio_labels_mapped, np.repeat(audio_labels_mapped, n_augments_audio, axis=0)], axis=0
)

# Make the sizes consistent
# NOTE(review): this keeps the first target_samples rows in load order; when there
# are more originals than target_samples it may drop whole genres — confirm intent.
audio_embeddings_extended = audio_embeddings_extended[:target_samples]
lyrics_embeddings_extended = lyrics_embeddings_extended[:target_samples]
audio_labels_extended = audio_labels_extended[:target_samples]

# Split into train and test sets
# NOTE(review): noisy copies of one original can land on both sides of the split,
# which leaks information into the test set.
X_train_audio, X_test_audio, X_train_lyrics, X_test_lyrics, y_train, y_test = train_test_split(
    audio_embeddings_extended, lyrics_embeddings_extended, audio_labels_extended,
    test_size=0.2,
    stratify=np.argmax(audio_labels_extended, axis=1),
    random_state=42
)

# Recompute class weights, keyed by the actual class id rather than by position
# in the unique-classes array (the two differ whenever a class id is missing).
y_train_ids = np.argmax(y_train, axis=1)
train_classes = np.unique(y_train_ids)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_ids)
class_weight_dict = {int(class_id): float(weight) for class_id, weight in zip(train_classes, class_weights)}

# Show the class distribution of the training set
unique, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
print(f"Distribuição de classes no conjunto de treino: {dict(zip(unique, counts))}")

# Multimodal model with L2 regularisation and wider layers
def create_multimodal_cnn(input_shape_lyrics, input_shape_audio, num_classes):
    """Build the regularised two-branch 1-D CNN for lyrics + audio embeddings.

    Both branches apply Conv1D(128, kernel 3, L2 0.001) -> BatchNormalization ->
    MaxPooling1D(2) -> Dropout(0.5) -> Flatten. The flattened features are
    concatenated and fed to Dense(256, L2 0.001) -> Dropout(0.4) -> softmax.

    Args:
        input_shape_lyrics: Shape of one lyrics sample (without the batch axis).
        input_shape_audio: Shape of one audio sample (without the batch axis).
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras ``Model`` taking ``[lyrics_input, audio_input]``.
    """
    from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, MaxPooling1D, Flatten, Dense, Dropout, concatenate
    from tensorflow.keras.regularizers import l2
    from tensorflow.keras.models import Model

    # (input name, sample shape) per modality; lyrics first so the input order is preserved.
    branch_specs = (
        ("lyrics_input", input_shape_lyrics),
        ("audio_input", input_shape_audio),
    )

    branch_inputs = []
    branch_features = []
    for input_name, sample_shape in branch_specs:
        branch_in = Input(shape=sample_shape, name=input_name)
        hidden = Conv1D(128, 3, activation='relu', kernel_regularizer=l2(0.001))(branch_in)
        hidden = BatchNormalization()(hidden)
        hidden = MaxPooling1D(pool_size=2)(hidden)
        hidden = Dropout(0.5)(hidden)
        hidden = Flatten()(hidden)
        branch_inputs.append(branch_in)
        branch_features.append(hidden)

    # Fuse both modalities and classify
    fused = concatenate(branch_features)
    head = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(fused)
    head = Dropout(0.4)(head)
    probabilities = Dense(num_classes, activation='softmax')(head)

    return Model(inputs=branch_inputs, outputs=probabilities)

# Create the model
input_shape_lyrics = X_train_lyrics.shape[1:]
input_shape_audio = X_train_audio.shape[1:]
num_classes = y_train.shape[1]

multimodal_model = create_multimodal_cnn(input_shape_lyrics, input_shape_audio, num_classes)
multimodal_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the multimodal model
# NOTE(review): the test split doubles as validation data for early stopping, so the
# metrics reported below are optimistic; a separate validation split would avoid that.
history = multimodal_model.fit(
    [X_train_lyrics, X_train_audio], y_train,
    validation_data=([X_test_lyrics, X_test_audio], y_test),
    epochs=50, batch_size=16, class_weight=class_weight_dict,
    callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]
)

# Evaluate the model
loss, accuracy = multimodal_model.evaluate([X_test_lyrics, X_test_audio], y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Classification report and confusion matrix
y_pred = np.argmax(multimodal_model.predict([X_test_lyrics, X_test_audio]), axis=1)
y_true = np.argmax(y_test, axis=1)

# Pass the full list of class ids explicitly: if a class is missing from the test
# split, classification_report would otherwise reject target_names for having a
# different length, and the confusion matrix would not line up with the tick labels.
class_ids = np.arange(len(label_encoder.classes_))

print(classification_report(
    y_true, y_pred, labels=class_ids, target_names=label_encoder.classes_, zero_division=0
))

conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


2024-12-01 22:30:07.580099: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 22:30:07.619073: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:30:07.951477: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 22:30:07.952869: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


NameError: name 'audio_embeddings' is not defined

In [None]:
import os
import pandas as pd

# Load the CSV with the lyrics metadata
metadata_csv_path = '/home/jmayos/songtest/embeddings/metadata.csv'  # Replace with the correct path
metadata_df = pd.read_csv(metadata_csv_path)

# Map file name (with '_embedding.npy' replaced by '.npy') -> genre.
# Genres are lower-cased AND stripped of spaces, the same normalisation
# load_lyrics_embeddings applies, so e.g. "Hip Hop" becomes "hiphop" in both places.
file_to_genre = {
    filename.replace('_embedding.npy', '.npy'): genre.replace(" ", "").lower()
    for filename, genre in zip(metadata_df['filename'], metadata_df['genre'])
}

# Associate every lyrics embedding file with its genre
# NOTE(review): relies on lyrics_embeddings_dir being defined by an earlier cell.
lyrics_labels = []
for file_name in os.listdir(lyrics_embeddings_dir):
    if not file_name.endswith("_embedding.npy"):
        continue

    # Swap the '_embedding.npy' suffix for '.npy' to match the dictionary keys
    file_base_name = file_name.replace('_embedding.npy', '.npy')

    genre = file_to_genre.get(file_base_name)
    if genre is None:
        print(f"Gênero não encontrado para {file_name}")
        continue

    lyrics_labels.append(genre)
    print(f"Achou gênero: {genre} para {file_name}")

# Fail loudly when nothing could be mapped
if not lyrics_labels:
    raise ValueError("Nenhum gênero foi mapeado para as embeddings de letras!")

# Show the genres mapped for the lyrics
print("Gêneros de letras mapeados:", set(lyrics_labels))

# Encode the class labels numerically
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical  # same Keras namespace as the rest of the notebook

# Dedicated names are used on purpose: `label_encoder` is the encoder whose classes_
# label the classification reports, and `lyrics_labels` must stay a list of genre
# strings for the set-based comparisons elsewhere, so neither is overwritten here.
lyrics_label_encoder = LabelEncoder()
lyrics_labels_encoded = lyrics_label_encoder.fit_transform(lyrics_labels)
lyrics_labels_onehot = to_categorical(lyrics_labels_encoded)


In [None]:
# Show which genres each modality contains.
# lyrics_labels may be a list of genre strings or, if it was one-hot encoded by an
# earlier cell, a 2-D array; set() over a 2-D array raises TypeError because its
# rows are unhashable, so that case is decoded back to genre names first.
lyrics_labels_array = np.asarray(lyrics_labels)
if lyrics_labels_array.ndim == 2:
    # NOTE(review): assumes label_encoder is the encoder that produced these one-hot rows — confirm.
    lyrics_genres = set(label_encoder.inverse_transform(lyrics_labels_array.argmax(axis=1)))
else:
    lyrics_genres = set(lyrics_labels)

print("Gêneros de áudio:", set(audio_labels))
print("Gêneros de letras:", lyrics_genres)
