In [None]:
# Apply LOOCV XGBoost to the dataset
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error

In [None]:
# Read the dataset from disk.
# Each feature set (lyrics, audio, metadata) is stored in its own .npy file
# under features/; the files are read in that order.
lyrics_features, audio_features, metadata_features = (
    np.load(npy_path)
    for npy_path in (
        'features/lyrics_features.npy',
        'features/features_audio.npy',
        'features/X_metadata.npy',
    )
)

# Labels come from the file stored alongside the audio features; per the
# original note, all feature sets have the same labels.
labels = np.load('features/labels_audio.npy')

In [None]:
# Show the loaded lyrics feature array as this cell's output (quick visual check).
lyrics_features

In [None]:
# Show the loaded audio feature array as this cell's output (quick visual check).
audio_features

In [None]:
# Show the loaded metadata feature array as this cell's output (quick visual check).
metadata_features

In [None]:
# Concatenate all features (early fusion): the lyrics, audio and metadata
# columns are joined along axis 1, so the three arrays must agree on every
# other dimension. The column order in X is lyrics, then audio, then metadata.
X = np.concatenate((lyrics_features, audio_features, metadata_features), axis=1)

In [None]:
# Optimize with Optuna - objective is F1-Score
import optuna
from sklearn.metrics import f1_score
from tqdm import tqdm

def objective(trial):
    """Optuna objective: weighted F1-score of an XGBoost classifier under LOOCV.

    Hyperparameters (learning rate, tree depth, row subsample ratio and L1
    penalty) are suggested by ``trial``. One classifier is then fitted per
    leave-one-out split of the notebook-level ``X`` / ``labels`` arrays, and
    the held-out predictions of all splits are pooled and scored together.

    Args:
        trial: The Optuna trial used to suggest hyperparameters.

    Returns:
        The weighted F1-score over all held-out predictions.
    """
    # Search space. The number of boosting rounds is fixed; only the
    # parameters below are tuned.
    n_estimators = 1000
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int('max_depth', 5, 8)
    subsample = trial.suggest_float('subsample', 0.5, 0.7)
    # L1 regularisation strength. The trial parameter keeps the name 'alpha'
    # so it stays compatible with trials already recorded under that name.
    alpha = trial.suggest_float('alpha', 0, 100)

    y_pred = []
    y_true = []

    # Each iteration holds out exactly one sample and trains on the rest.
    for train_index, test_index in tqdm(LeaveOneOut().split(X), total=len(labels)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        model = xgb.XGBClassifier(
            objective='multi:softmax',
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
            # `reg_alpha` is the scikit-learn API name of XGBoost's `alpha`.
            reg_alpha=alpha,
        )
        model.fit(X_train, y_train)

        # The test fold has a single row, so take its only prediction/label.
        y_pred.append(model.predict(X_test)[0])
        y_true.append(y_test[0])

    # Pool the held-out predictions and score them in one go.
    return f1_score(np.array(y_true), np.array(y_pred), average='weighted')

In [None]:
# Create the Optuna study in the SQLite store (or reuse the stored study of the
# same name if it already exists), then run the hyperparameter search.
study = optuna.create_study(
    study_name='early_fusion_v2',
    storage='sqlite:///early_fusion.db',
    direction='maximize',
    load_if_exists=True,
)
study.optimize(objective, n_trials=3)

In [None]:
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import xgboost as xgb
from sklearn.model_selection import LeaveOneOut
from tqdm import tqdm
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import LeaveOneOut
from itertools import cycle

# Leave-one-out evaluation of an XGBoost classifier that uses library defaults
# apart from the objective and the number of trees.
# Ensure X and labels are available before running.
loo = LeaveOneOut()

y_true = []  # True label of each held-out sample
y_score = []  # Predicted class-probability vector of each held-out sample

for train_index, test_index in tqdm(loo.split(X), total=len(labels)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Train XGBoost Classifier on everything except the held-out sample
    model = xgb.XGBClassifier(objective='multi:softmax', n_estimators=1000)
    model.fit(X_train, y_train)

    # Predict probabilities instead of class labels; the test fold has one row
    probas = model.predict_proba(X_test)[0]  # Probability distribution across classes

    y_score.append(probas)
    y_true.append(y_test[0])

# Convert to numpy arrays
y_score = np.array(y_score)
y_true = np.array(y_true)

# Hard predictions: index of the most probable class for each held-out sample.
# Computed once and kept in `y_pred` so the LOOCV predictions are available to
# later cells.
# NOTE(review): using the column index as the label assumes the classes are
# the integers 0..n_classes-1 — confirm.
y_pred = np.argmax(y_score, axis=1)

# Print classification results
print("Classification Report:")
print(classification_report(y_true, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

In [None]:
# ====== AUC-ROC Curve Plotting ======
import os

# Use tableau style. Applied before the figure is created: rcParams set by a
# style sheet only affect artists created afterwards, so calling this in the
# middle of the plotting code would style only part of the figure.
plt.style.use('tableau-colorblind10')

# Binarize the true labels for multi-class (one-vs-rest) AUC-ROC computation
n_classes = len(np.unique(y_true))
y_true_bin = label_binarize(y_true, classes=np.arange(n_classes))

# Compute ROC curve and ROC area for each class, keyed by class index
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Persist the curves. np.save raises if the target directory is missing, so
# create it first. The dicts are stored as pickled object arrays; read them
# back with np.load(path, allow_pickle=True).item().
os.makedirs("auc_roc", exist_ok=True)
np.save("auc_roc/fpr_early.npy", fpr)
np.save("auc_roc/tpr_early.npy", tpr)
np.save("auc_roc/auc_roc_early.npy", roc_auc)

# Plot all ROC curves, one colour per class
plt.figure(figsize=(8, 6))
colors = cycle(['blue', 'red', 'green', 'purple', 'orange', 'brown', 'pink', 'gray'])

for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label=f'ROC curve for class {i} (area = {roc_auc[i]:0.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Diagonal line (chance level)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve for Multi-Class Classification')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Maps each integer key to a genre name; the keys are the positions
# (starting at 0) of the names in the list below.
genre_dict = dict(enumerate([
    'Country',
    'Hip-Hop',
    'Indie',
    'Jazz',
    'Metal',
    'Pop',
    'Rap',
    'Rock',
]))

In [None]:
import joblib
# Save the model currently bound to `model` under models/.
# NOTE(review): when the cells are run top to bottom, `model` is the classifier
# fitted in the last iteration of the LOOCV loop, i.e. trained on every sample
# except the final one — confirm this is the model that is meant to be saved.
joblib.dump(model, 'models/early_fusion_xgboost.joblib')

In [None]:
# Plot MSE for each class with genre names
import matplotlib.pyplot as plt

# No visible cell defines `mse`, so it is computed here as the per-class mean
# squared error between the one-hot encoded true labels and the predicted
# class probabilities in `y_score`.
# NOTE(review): assumes y_true / y_score are the LOOCV arrays and that the
# class labels are the integers 0..n_classes-1 (column k of y_score belongs to
# label k) — confirm this is the intended metric.
one_hot_true = (np.asarray(y_true)[:, None] == np.arange(y_score.shape[1])).astype(float)
mse = np.mean((one_hot_true - y_score) ** 2, axis=0)

plt.bar(list(genre_dict.values()), mse)
plt.xlabel('Genre')
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error for each genre')
plt.show()

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap for heatmaps. Note that this rebinds the notebook-level
# name `colors`, which the ROC plotting cell also assigns.
colors = ["white", "#455681"]  # White to #455681 gradient
custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", colors)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_single_confusion_matrix(test_labels, test_preds, label_names):
    """Plot the raw and the row-normalised confusion matrix of a set of predictions.

    Two figures are shown: the count matrix, then the matrix normalised per
    true class (each non-empty row sums to 1). The normalised figure is also
    written to ``confusion_matrix_normalized_loocv.eps``.

    Args:
        test_labels: True class labels.
        test_preds: Predicted class labels, aligned with ``test_labels``.
        label_names: Display names used as tick labels on both axes, one per
            row/column of the confusion matrix, in matrix order.

    Note:
        Text rendering is switched to LaTeX (``usetex=True``), which needs a
        working LaTeX installation; the rc settings stay in effect after the
        call.
    """
    tick_labels = list(label_names)
    cm = confusion_matrix(test_labels, test_preds)

    # Row-normalise. A class that never occurs as a true label has an all-zero
    # row; leave it at 0 instead of dividing by zero and producing NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    # Use latex for all text in both figures
    plt.rc('text', usetex=True)
    plt.rc('font', family='serif')

    # Plot Non-Normalized
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title("Confusion Matrix (Non-Normalized)")
    plt.ylabel("True Genre")
    plt.xlabel("Predicted Genre")
    plt.show()

    # White to #455681 gradient for the normalised heatmap
    gradient = ["white", "#455681"]
    custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", gradient)

    # Plot the normalized confusion matrix; vmax pins the colour scale to 1.0
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_normalized, cmap=custom_cmap, annot=True, fmt=".2f",
                xticklabels=tick_labels, yticklabels=tick_labels, vmax=1.0)
    plt.xlabel("Predicted", fontdict={"fontsize": 12})
    plt.ylabel("True", fontdict={"fontsize": 12})
    plt.tight_layout()
    plt.savefig("confusion_matrix_normalized_loocv.eps", dpi=300)
    plt.show()

# Plot single confusion matrix for the LOOCV results.
# The hard predictions are derived directly from the LOOCV probabilities in
# `y_score` (most probable class per sample), so this cell does not depend on
# a `y_pred` variable having been set elsewhere.
plot_single_confusion_matrix(y_true, np.argmax(y_score, axis=1), list(genre_dict.values()))




In [None]:
import joblib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle

# Load the trained XGBoost model back from the joblib file
# (joblib files are pickle-based: only load files from a trusted source).
model = joblib.load('models/early_fusion_xgboost.joblib')

# Assuming X and labels are available.
# This rebinds the notebook-level y_true / y_pred used by earlier cells.
# NOTE(review): if the saved model was fitted on rows of X, the predictions
# below are largely in-sample and will look better than the LOOCV results —
# confirm this is intended.
y_true = np.array(labels)  # True labels
y_pred = model.predict(X)  # Predicted class labels for every row of X (predict, not predict_proba)

In [None]:
# Print classification report for the current y_true / y_pred
# (when run in order, these are the reloaded model's predictions on all of X).
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

In [None]:
# Save the predicted labels (the current y_pred array) to a .npy file under features/
np.save('features/labels_audio_pred.npy', y_pred)

In [None]:
# Visual check of the model: predicted label plotted against true label
import matplotlib.pyplot as plt

plt.scatter(y_true, y_pred)
# Label the axes that the scatter call drew on, in a single call.
plt.gca().set(xlabel='True', ylabel='Predicted', title='True vs Predicted')
plt.show()

In [None]:
# Confusion matrix of the current y_true / y_pred (rows: true class,
# columns: predicted class); the bare `cm` shows it as the cell output.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
cm