This notebook is basically the exact same one as MGR but the processing is applied to the larger feature set extracted at the end of the data_extraction notebook.

In [None]:
# Make imports
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure


In [None]:
# Load the data into dataframes
features = pd.read_csv('data/more_extracted_features.csv')
metadata = pd.read_csv('data/metadata.csv')

# Remove the tracks that were not found in the features dataset from the metadata
metadata = metadata[metadata['track_id'].isin(features['track_id'])]

print (metadata.shape)
print (features.shape)

In [None]:
# Drop the track_id column first
X = features.drop(columns=['track_id']).values

# Dimensionality reduction (PCA)
pca = PCA(n_components=X.shape[1])  # Keep all components initially
X_pca = pca.fit_transform(X)

# Calculate cumulative explained variance ratio
cumulative_explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Plot explained variance ratio
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_explained_variance_ratio) + 1), cumulative_explained_variance_ratio, marker='o', linestyle='-')
plt.title('Cumulative Explained Variance Ratio')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

cumulative_explained_variance_ratio < 0.99

In [None]:
# Dimensionality reduction using PCA
# First, reducing to 2 dimensions for visualization

# Perform PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Clustering using KMeans with 8 clusters because we know that there are 8 genres
n_clusters = 8
kmeans = KMeans(n_clusters=8)
clusters = kmeans.fit(X_pca)

# Visualize the clusters
plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters.labels_)
plt.title('PCA with KMeans Clustering')
plt.show()


In [6]:
# Evaluate clustering performance
true_labels = metadata['track_genre'].values

silhouette = silhouette_score(X_pca, clusters.labels_)
ari = adjusted_rand_score(true_labels, clusters.labels_)
nmi = normalized_mutual_info_score(true_labels, clusters.labels_)
h, c, v = homogeneity_completeness_v_measure(true_labels, clusters.labels_)

print(f"Silhouette Score: {silhouette}")
print(f"Homogeneity: {h}")
print(f"Completeness: {c}")
print(f"V-measure: {v}")
print(f"Adjusted Rand Index (ARI): {ari}")
print(f"Normalized Mutual Information (NMI): {nmi}")

Silhouette Score: 0.3371514208156057
Homogeneity: 0.10711778622582027
Completeness: 0.10989642859289397
V-measure: 0.10848931858983084
Adjusted Rand Index (ARI): 0.06312327210065587
Normalized Mutual Information (NMI): 0.10848931858983084


About the same results as with the smaller dataset, meaning the clusters are okay but not great.

We can try with 10 dimensions for clustering and see how it affects the metrics (because with 10 dimensions we have more than 99% of the variance explained)

In [7]:
# Perform PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

# Clustering using KMeans with 8 clusters because we know that there are 8 genres
n_clusters = 8
kmeans = KMeans(n_clusters=8)
clusters = kmeans.fit(X_pca)

# We can't visualize the clusters

# Evaluate clustering performance
silhouette = silhouette_score(X_pca, clusters.labels_)
ari = adjusted_rand_score(true_labels, clusters.labels_)
nmi = normalized_mutual_info_score(true_labels, clusters.labels_)
h, c, v = homogeneity_completeness_v_measure(true_labels, clusters.labels_)

print(f"Silhouette Score: {silhouette}")
print(f"Homogeneity: {h}")
print(f"Completeness: {c}")
print(f"V-measure: {v}")
print(f"Adjusted Rand Index (ARI): {ari}")
print(f"Normalized Mutual Information (NMI): {nmi}")

Silhouette Score: 0.26173796366584634
Homogeneity: 0.1094249193641957
Completeness: 0.11238282224384505
V-measure: 0.11088414834216667
Adjusted Rand Index (ARI): 0.0656090558106795
Normalized Mutual Information (NMI): 0.11088414834216669


In [None]:
# Split our dataset in 3 parts: train, validation and test  

# First, we need to merge the features and metadata dataframes
data = pd.merge(features, metadata, on='track_id')

# Split the data into train, validation and test sets
train = data[data['set'] == 'training']
validation = data[data['set'] == 'validation']  
test = data[data['set'] == 'test']

# Sanity check
print(train.shape)
print(validation.shape)
print(test.shape)

# Drop the track_id, track_genre, genre_id and set columns
X_train = train.drop(columns=['track_id', 'track_genre', 'set', 'genre_id']).values
X_validation = validation.drop(columns=['track_id', 'track_genre', 'set', 'genre_id']).values
X_test = test.drop(columns=['track_id', 'track_genre', 'set', 'genre_id']).values

# Split the data into data and labels, labels being the track_genre column
y_train = train['track_genre'].values
y_validation = validation['track_genre'].values
y_test = test['track_genre'].values

# Sanity check
print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)
print(y_train.shape)
print(y_validation.shape)
print(y_test.shape)

In [9]:
# First we want to train SVM model with 5 fold cross validation
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Merge the train and validation sets for cross validation
X_cross_validation = np.concatenate((X_train, X_validation))
y_cross_validation = np.concatenate((y_train, y_validation))

'''
# Use PCA to reduce the dimensionality of the data
pca = PCA(n_components=5)
X_cross_validation = pca.fit_transform(X_cross_validation)
X_test = pca.transform(X_test)
'''

# Standardize features by removing the mean and scaling to unit variance.
scaler = skl.preprocessing.StandardScaler(copy=False)
scaler.fit_transform(X_cross_validation)
scaler.transform(X_test)

# Define the parameter grid
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=5)

# Fit the model
grid.fit(X_cross_validation, y_cross_validation)

# Print the best parameters
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)
#print(grid.best_index_)
#print(grid.scorer_)
#print(grid.n_splits_)
#print(grid.refit_time_)
#print(grid.cv_results_)

# Make predictions
y_pred = grid.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.124 total time=  13.9s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.203 total time=  15.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.126 total time=  11.7s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.126 total time=  12.4s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.125 total time=  12.5s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.124 total time=  11.7s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.126 total time=  11.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.125 total time=  11.6s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.126 total time=  11.3s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.125 total time=  10.7s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.338 total time=  10.1s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [10]:
# Let's now try to train a Random Forest model
from sklearn.ensemble import RandomForestClassifier

'''
# Use PCA to reduce the dimensionality of the data
pca = PCA(n_components=5)
X_cross_validation = pca.fit_transform(X_cross_validation)
X_test = pca.transform(X_test)
'''

# Define the parameter grid
param_grid = {'n_estimators': [10, 100, 1000], 'max_features': ['auto', 'sqrt', 'log2']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, verbose=3, cv=5)

# Fit the model
grid.fit(X_cross_validation, y_cross_validation)

# Print the best parameters
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)
#print(grid.best_index_)
#print(grid.scorer_)
#print(grid.n_splits_)
#print(grid.refit_time_)
#print(grid.cv_results_)

# Make predictions
y_pred = grid.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ..max_features=auto, n_estimators=10;, score=nan total time=   0.0s
[CV 2/5] END ..max_features=auto, n_estimators=10;, score=nan total time=   0.0s
[CV 3/5] END ..max_features=auto, n_estimators=10;, score=nan total time=   0.0s
[CV 4/5] END ..max_features=auto, n_estimators=10;, score=nan total time=   0.0s
[CV 5/5] END ..max_features=auto, n_estimators=10;, score=nan total time=   0.0s
[CV 1/5] END .max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END .max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END .max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END .max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END .max_features=auto, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END max_features=auto, n_estimators=1000;, score=nan total time=   0.0s
[CV 2/5] END max_features=auto, n_estimators=1000

15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Cours\Echange Milan\Audio Pattern Recognition\music-genre-recognition\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Cours\Echange Milan\Audio Pattern Recognition\music-genre-recognition\venv\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "d:\Cours\Echange Milan\Audio Pattern Recognition\music-genre-recognition\venv\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints

{'max_features': 'sqrt', 'n_estimators': 1000}
RandomForestClassifier(n_estimators=1000)
0.5423969280165777
               precision    recall  f1-score   support

   Electronic       0.49      0.58      0.53       100
 Experimental       0.29      0.19      0.23       100
         Folk       0.22      0.21      0.21       100
      Hip-Hop       0.58      0.60      0.59       100
 Instrumental       0.43      0.47      0.45       100
International       0.45      0.49      0.47       100
          Pop       0.36      0.35      0.36       100
         Rock       0.59      0.59      0.59       100

     accuracy                           0.43       800
    macro avg       0.43      0.43      0.43       800
 weighted avg       0.43      0.43      0.43       800

