In [None]:
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### Test trained model on new dataset (on metadata)

In [None]:
# Hyperparameters reported by the Optuna search, kept here as a record
# (presumably the configuration of the saved model below — TODO confirm):
#   n_estimators     = 80
#   max_depth        = 12
#   learning_rate    = 0.1684744342969461
#   gamma            = 0.2935882001439162
#   min_child_weight = 4
#   random_state     = 42
# Equivalent constructor call:
#   XGBClassifier(n_estimators=80, max_depth=12, learning_rate=0.1684744342969461,
#                 gamma=0.2935882001439162, min_child_weight=4, random_state=42)

MODEL_PATH = 'models/xgb_metadata_v1_8_genres_smote_normalized.joblib'
DATASET_PATH = 'data/fma_cut100_echonest_lyrics_fake_country.csv'

# Restore the already-trained classifier from disk.
# joblib.load unpickles the file, so only point it at a trusted artifact.
model = joblib.load(MODEL_PATH)

# Read the evaluation data; the two header rows become two-level columns.
df = pd.read_csv(DATASET_PATH, header=[0, 1])

In [None]:
# Preview three random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
df.sample(3, random_state=42)

In [None]:
# Keep tracks tagged as English; NaN is listed as well so that rows with a
# missing language code are meant to be kept too.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df = df[df['track', 'language_code'].isin(['en', np.nan])].copy()

In [None]:
# Model inputs, addressed by their two-level (group, name) column key.
# Nine columns live under 'track'; the release year lives under 'album'.
audio_columns = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration']
features = [('track', column) for column in audio_columns]
features.append(('album', 'year_released'))

In [None]:
# Take the first run of four digits in the release-date string as the year.
# expand=False makes extract return a Series (the default is a one-column
# DataFrame), which is the natural right-hand side for a single-column
# assignment. The values are still strings here.
df[('album', 'year_released')] = df[('album', 'date_released')].str.extract(r'(\d{4})', expand=False)

In [None]:
# Narrow the frame to the model inputs plus the genre label.
# .copy() detaches the selection from the wider frame so the in-place column
# updates in later cells do not raise SettingWithCopyWarning.
df = df[features + [('track', 'one_genre')]].copy()

In [None]:
# Flatten the header: keep only the second level (the plain column names) and
# discard the 'track' / 'album' group level.
df.columns = df.columns.get_level_values(1)

In [None]:
# Inspect the frame after the column header has been flattened.
df

In [None]:
# Turn the extracted year strings into numbers (anything unparseable becomes
# NaN), then fill the gaps with the median release year of the same genre.
release_year = pd.to_numeric(df['year_released'], errors='coerce')
genre_median_year = release_year.groupby(df['one_genre']).transform('median')
df['year_released'] = release_year.fillna(genre_median_year)



In [None]:
# Order the rows by genre. The sorted frame is rebound to the name instead of
# using inplace=True, so the cell does not mutate an existing object in place.
df = df.sort_values('one_genre')

In [None]:
# Flat names of the model input columns, matching the single-level header.
features = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration',
    'year_released',
]

In [None]:
from sklearn.preprocessing import StandardScaler

# Targets: genre names mapped to integer ids by a freshly fitted encoder.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['one_genre'])

# Inputs: the selected feature columns, standardized column by column.
# NOTE(review): both the encoder and the scaler are fitted on this evaluation
# data. Whether that reproduces the label ids and scaling the loaded model was
# trained with cannot be seen from this notebook — confirm.
scaler = StandardScaler()
X = scaler.fit_transform(df[features])


In [None]:
# Inspect the standardized feature matrix returned by the scaler.
X

In [None]:
# Inspect the integer-encoded genre labels.
y

In [None]:
# Persist the evaluation arrays (scaled inputs, encoded targets) as .npy files.
np.save(file='features/X_metadata.npy', arr=X)
np.save(file='features/y_metadata.npy', arr=y)

In [None]:
# Dimensions of the scaled feature matrix: one row per track in df, one column
# per entry in `features`.
X.shape

In [None]:
# Length of the encoded label vector (one label per row of df).
y.shape

In [None]:
# Encoded genre labels (the same array that was written to disk).
y

In [None]:
# Genre names known to the encoder; position i is the name behind encoded id i.
label_encoder.classes_

In [None]:
# First five rows of the scaled feature matrix (a head slice, not a random sample)
X[:5]

In [None]:
# First five encoded labels (a head slice, not a random sample)
y[:5]

In [None]:
# Hard class predictions and per-class probabilities for every row of X.
y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)

# Genre names in encoded-id order, read from the encoder instead of a fixed
# list of eight ids, so the report follows however many genres were encoded.
genre_labels = list(label_encoder.classes_)

# `labels` pins the report rows to exactly the encoded ids, which keeps them
# aligned one-to-one with `target_names`.
print(classification_report(y, y_pred, labels=np.arange(len(genre_labels)), target_names=genre_labels))

# Overall accuracy, shown as the cell output.
accuracy_score(y, y_pred)

In [None]:
# Look at the very first row: class probabilities, predicted id, true id.
first_true = y[0]
first_pred = y_pred[0]
print(y_pred_proba[0])
print(first_pred)
print(first_true)

# The same two labels decoded back to genre names (true first, then predicted).
print(label_encoder.inverse_transform([first_true]))
print(label_encoder.inverse_transform([first_pred]))

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Render figure text through LaTeX with a serif font. This needs a working
# LaTeX installation on the machine running the notebook. plt.rc changes the
# global settings, so one call covers both figures in this cell.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Tick labels come straight from the encoder, so they stay aligned with the
# encoded class ids for any number of genres (no fixed list of eight ids).
genre_names = list(label_encoder.classes_)
class_ids = np.arange(len(genre_names))

# --- Raw counts -------------------------------------------------------------
# `labels` pins the matrix to one row/column per genre, even if a genre never
# occurs in `y` or `y_pred`, so it always matches `genre_names`.
cm = confusion_matrix(y, y_pred, labels=class_ids)
# fmt='d' prints whole counts; seaborn's default format ('.2g') would show
# counts of 100 or more in scientific notation.
sns.heatmap(cm, annot=True, fmt='d', xticklabels=genre_names, yticklabels=genre_names, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')

plt.show()

# --- Row-normalized matrix --------------------------------------------------
colors = ["#FFFFFF", "#455681"]  # White to #455681 gradient
custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", colors)

# Divide each row by its total, so a row shows how the samples of one true
# genre were distributed over the predicted genres. Rows with no true samples
# stay all-zero instead of triggering a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
conf_matrix_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_norm, cmap=custom_cmap, annot=True, fmt=".2f", xticklabels=genre_names, yticklabels=genre_names, vmax=1.0)
plt.xlabel("Predicted", fontdict={"fontsize": 12})
plt.ylabel("True", fontdict={"fontsize": 12})
plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig("confusion_matrix_normalized_multi_metadata.eps", dpi=300)
plt.show()

In [None]:
# roc_auc_score is not used in this cell; its import is kept in case cells
# outside this view rely on it. plt and np come from the import cell at the
# top of the notebook.
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize

# One-vs-rest ROC analysis of the loaded model on the evaluation data (X, y).
# Genre names are read from the encoder, so the number of curves follows the
# data instead of a fixed list of eight class ids.
classes = list(label_encoder.classes_)

# Column i of y_test_bin is 1 where the true label is class i, else 0.
# label_binarize returns a single column for a two-class problem, so the
# per-class loops below assume at least three genres.
y_test_bin = label_binarize(y, classes=range(len(classes)))
# NOTE(review): assumes column i of predict_proba corresponds to encoded
# label i of this notebook's encoder — confirm against the training setup.
y_pred_prob = model.predict_proba(X)  # Get probabilities for ROC computation

# Per-class ROC curve and area under it, keyed by encoded class id.
fpr = {}
tpr = {}
roc_auc = {}

for i in range(len(classes)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])


# These three objects are dicts, so np.save stores them as pickled object
# arrays; reading them back requires np.load(path, allow_pickle=True).item().
np.save("fpr_meta_multi.npy", fpr)
np.save("tpr_meta_multi.npy", tpr)
np.save("roc_auc_meta_multi.npy", roc_auc)

# Genre names in encoded-id order, saved so the curves can be labelled later.
np.save("labels_meta.npy", classes)

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
for i in range(len(classes)):
    plt.plot(fpr[i], tpr[i], label=f"Class {classes[i]} (AUC = {roc_auc[i]:.2f})")

# Diagonal reference line: the ROC of a random guess
plt.plot([0, 1], [0, 1], 'k--')

plt.title("Multi-Class ROC Curve")
plt.xlabel("False Positive Rate", fontdict={"fontsize": 12})
plt.ylabel("True Positive Rate", fontdict={"fontsize": 12})
plt.legend(loc="lower right", prop={"size": 12})  # Adjust legend location if needed
plt.yticks(fontsize=11)
plt.xticks(fontsize=11)
# NOTE(review): the style is switched after the curves were drawn — confirm
# this ordering is intended.
plt.style.use('fast')
plt.tight_layout()
plt.grid()
plt.savefig("roc_curve_metadata.png", dpi=300)
plt.show()
