In [10]:
import pandas as pd
pd.options.display.max_columns = 200

songs = pd.read_csv('data/spotify_data.csv', index_col=[0])
songs = songs.drop_duplicates(['track_id'])
# Drop rows with missing values
songs.dropna(inplace=True)

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler

songs_data = songs.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
# Normalize data before clustering since clustering relies on distances
scaler = MinMaxScaler()
songs_data[['duration_ms', 'popularity', 'tempo', 'time_signature', 'loudness', 'key']] = scaler.fit_transform(songs_data[['duration_ms', 'popularity', 'tempo', 'time_signature', 'loudness', 'key']])
genres = songs["track_genre"]
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Using stratify might help because we have an imbalanced dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.2, 
                                                    stratify=encoded_genres, shuffle=True, random_state=50)
# Train model
mlp = MLPClassifier()
mlp.fit(X_train, y_train)



In [12]:
# Evaluate model
predictions = mlp.predict(X_test)
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Baseline performance using an MLP")
print(f"Accuracy: {base_accuracy}")
print(f"F1-score: {base_f1_weighted}")

Baseline performance using an MLP
Accuracy: 0.3472810340985068
F1-score: 0.3249424533653904
