## Gather data

In [None]:
import pandas as pd
import joblib
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Load the data from csv file
df = pd.read_csv('data/spotify_songs.csv')

In [None]:
df.head()

In [None]:
df['playlist_subgenre'].str.contains('hip hop').sum()

In [None]:
df2 = pd.read_csv('data/spotify_songs_2.csv')


In [None]:
# Get common columns from both dataframes
common_columns = df.columns.intersection(df2.columns)


In [None]:
common_columns

In [None]:
df = df[common_columns]
df2 = df2[common_columns]

In [None]:
# Combine those two dataframes
df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Check for duplicates in pairs track name : track_artist
df.duplicated(subset=['track_name', 'track_artist']).sum()

In [None]:
# Delete those duplicates
df.drop_duplicates(subset=['track_name', 'track_artist'], inplace=True)

In [None]:
# Convert duration_ms to duration (in seconds)
df['duration'] = df['duration_ms'] / 1000

In [None]:
df[df['playlist_genre'].isin(['rock', 'pop', 'rap'])]

In [None]:
df['playlist_genre'].unique()

In [None]:
# For every playlist genre, show which playlist subgenres occur under it
for genre in df['playlist_genre'].unique():
    rows_of_genre = df['playlist_genre'] == genre
    subgenres = df.loc[rows_of_genre, 'playlist_subgenre'].unique()
    print(genre)
    print(subgenres)

In [None]:
df['playlist_subgenre'].unique()

In [None]:
# Get year_released from track_album_release_date (date is either in format yyyy-mm-dd or yyyy).
# expand=False makes extract return a Series, and to_numeric stores the year as a number
# rather than a string, so it has the same kind of values as the integer years of the
# FMA rows that are concatenated onto this column later.
release_year = df['track_album_release_date'].str.extract(r'(\d{4})', expand=False)
df['year_released'] = pd.to_numeric(release_year, errors='coerce')
df['year_released'].min()

In [None]:
# Load the FMA track metadata and the Echonest features from the two csv files under data/fma/.

# Combine tracks with echonest

# Load fma data: the first two file rows form a two-level column header, file row 2 is skipped.
# The two renames give the unnamed first column the label ('track', 'track_id').
fma = pd.read_csv('data/fma/tracks.csv', header=[0, 1], skiprows=[2])
fma = fma.rename(columns={"Unnamed: 0_level_1": "track_id"}, level=1)
fma = fma.rename(columns={"Unnamed: 0_level_0": "track"}, level=0)

# Load echonest data: same two-level header, here file rows 1 and 3 are skipped
echonest = pd.read_csv('data/fma/echonest.csv', header=[0,1], skiprows=[1,3])

# Drop the last 224 columns
# NOTE(review): presumably these are the Echonest temporal features - confirm against the file
echonest = echonest.iloc[:, :-224]
echonest = echonest.rename(columns={"Unnamed: 0_level_1": "track_id"}, level=1)
echonest = echonest.rename(columns={"Unnamed: 0_level_0": "track"}, level=0)

# Change track_id to int in both frames so they can be merged on it
fma[('track', 'track_id')] = fma[('track', 'track_id')].astype(int)
echonest[('track', 'track_id')] = echonest[('track', 'track_id')].astype(int)

In [None]:
# Merge fma and echonest
feg = pd.merge(echonest, fma, on=[('track', 'track_id')], how='inner')

genres = pd.read_csv('data/fma/genres.csv')

import ast


def _genre_titles(raw_ids):
    """Return the comma-separated genre titles for the ids encoded in ``raw_ids``.

    ``raw_ids`` is the text of a Python list literal such as ``'[21, 12]'``. It is parsed
    with ``ast.literal_eval`` instead of ``eval`` so that text read from the csv file is
    never executed as code. Titles are looked up in the module-level ``genres`` frame and
    are emitted in the row order of that frame.

    Raises ValueError or SyntaxError when ``raw_ids`` is not a plain Python literal.
    """
    genre_ids = ast.literal_eval(raw_ids)
    matching_rows = genres['genre_id'].isin(genre_ids)
    return ', '.join(genres.loc[matching_rows, 'title'])


def get_genres(row):
    """Return the genre titles for the ids stored in the row's ('track', 'genres') cell."""
    return _genre_titles(row[('track', 'genres')])


def get_genres_all(row):
    """Return the genre titles for the ids stored in the row's ('track', 'genres_all') cell."""
    return _genre_titles(row[('track', 'genres_all')])

feg[('track', 'genres')] = feg.apply(get_genres, axis=1)
feg[('track', 'genres_all')] = feg.apply(get_genres_all, axis=1)

In [None]:
feg[('track', 'genre_top')].value_counts()

In [None]:
feg[('track', 'genres')].str.split(', ').explode().value_counts()

In [None]:
feg[('track', 'genres_all')].str.split(', ').explode().value_counts()

In [None]:
fma_country = pd.DataFrame(columns=feg.columns)

In [None]:
# Target genres: rock, pop, rap, hip hop, indie, country, jazz, metal.
# The first five come from the Spotify data, so country, jazz and metal are taken from
# this dataset (country can be folk).

# Country first: a track qualifies when 'Country' appears (case-insensitively) in either
# its genres_all or its genres string.
country_in_all = feg[('track', 'genres_all')].str.contains('Country', case=False)
country_in_own = feg[('track', 'genres')].str.contains('Country', case=False)
country_rows = feg[country_in_all | country_in_own]

# Append the matching rows to fma_country
fma_country = pd.concat([fma_country, country_rows], ignore_index=True)
fma_country

In [None]:
# Let's see if there is genre very similar to Country like: Folk in genres
fma_country = pd.concat([fma_country, feg[feg[('track', 'genres')].str.contains('Folk', case=False)]], ignore_index=True)

In [None]:
fma_country.drop_duplicates(subset=[('artist', 'name'),('track', 'title')], inplace=True)

In [None]:
# Add new column track echonest genre to fma_country
fma_country[('echonest', 'genre')] = 'Country'

In [None]:
fma_country

In [None]:
# Exclude fma_country from feg by track_id
feg = feg[~feg[('track', 'track_id')].isin(fma_country[('track', 'track_id')])]
feg.count()

In [None]:
# Now let's add jazz: start from the tracks whose top-level genre is Jazz.
# The rows are selected directly instead of being concatenated onto an empty placeholder
# frame: pandas deprecates how empty entries are treated in concat, and a future version
# may let the empty frame change the resulting column dtypes.
fma_jazz = feg[feg[('track', 'genre_top')] == 'Jazz'].reset_index(drop=True)
fma_jazz.track.track_id.count()

In [None]:
# Also collect tracks that mention Jazz (case-insensitively) in genres_all or genres
jazz_in_all = feg[('track', 'genres_all')].str.contains('Jazz', case=False)
jazz_in_own = feg[('track', 'genres')].str.contains('Jazz', case=False)
fma_jazz = pd.concat([fma_jazz, feg[jazz_in_all | jazz_in_own]], ignore_index=True)
fma_jazz.track.track_id.count()

In [None]:
fma_jazz.drop_duplicates(subset=[('artist', 'name'),('track', 'title')], inplace=True)

In [None]:
fma_jazz.track.track_id.count()

In [None]:
fma_jazz[('echonest', 'genre')] = 'Jazz'

In [None]:
# Exclude fma_jazz from feg by track_id
feg = feg[~feg[('track', 'track_id')].isin(fma_jazz[('track', 'track_id')])]

In [None]:
feg.track.track_id.count()

In [None]:
# Now let's add metal: start from the tracks whose top-level genre is Metal.
# The rows are selected directly instead of being concatenated onto an empty placeholder
# frame: pandas deprecates how empty entries are treated in concat, and a future version
# may let the empty frame change the resulting column dtypes.
fma_metal = feg[feg[('track', 'genre_top')] == 'Metal'].reset_index(drop=True)
fma_metal.track.track_id.count()

In [None]:
# Also collect tracks that mention Metal (case-insensitively) in genres_all or genres
metal_in_all = feg[('track', 'genres_all')].str.contains('Metal', case=False)
metal_in_own = feg[('track', 'genres')].str.contains('Metal', case=False)
fma_metal = pd.concat([fma_metal, feg[metal_in_all | metal_in_own]], ignore_index=True)
fma_metal.track.track_id.count()

In [None]:
fma_metal.drop_duplicates(subset=[('artist', 'name'),('track', 'title')], inplace=True)

In [None]:
fma_metal[('echonest', 'genre')] = 'Metal'

In [None]:
fma_metal.track.track_id.count()

In [None]:
# Stack the country, jazz and metal subsets, then keep the first row for every
# (artist name, track title) pair
genre_subsets = [fma_country, fma_jazz, fma_metal]
fma_cjm = pd.concat(genre_subsets, ignore_index=True)
fma_cjm = fma_cjm.drop_duplicates(subset=[('artist', 'name'), ('track', 'title')])

In [None]:
fma_cjm.track.track_id.count()

In [None]:
# Select the ('echonest', <field>) columns listed below, followed by the track duration
# and the album release date.
echonest_fields = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'genre']
selected_columns = [('echonest', field) for field in echonest_fields]
selected_columns += [('track', 'duration'), ('album', 'date_released')]
fma_cjm_echonest = fma_cjm[selected_columns]

In [None]:
# fma_cjm_echonest was produced by selecting columns from fma_cjm; take an explicit copy so
# the column assignments below never act on (or warn about) a slice of that frame.
fma_cjm_echonest = fma_cjm_echonest.copy()

# Drop 1st level of columns
fma_cjm_echonest.columns = fma_cjm_echonest.columns.droplevel(0)

# Pull the four-digit year out of date_released; expand=False returns a Series.
# Values without a year become NaN and are coerced to numeric.
release_year = fma_cjm_echonest['date_released'].str.extract(r'(\d{4})', expand=False)
fma_cjm_echonest['year_released'] = pd.to_numeric(release_year, errors='coerce')

# Fill NaN values with the median year_released of the row's genre
genre_median_year = fma_cjm_echonest.groupby('genre')['year_released'].transform('median')
fma_cjm_echonest['year_released'] = fma_cjm_echonest['year_released'].fillna(genre_median_year)

#fma_cjm_echonest.year_released.astype(int).min()

In [None]:
fma_cjm_echonest['year_released'] = fma_cjm_echonest['year_released'].astype(int)

In [None]:
fma_cjm_echonest

In [None]:
fma_cjm_echonest

In [None]:
# Get common columns from both dataframes
common_columns = df.columns.intersection(fma_cjm_echonest.columns)
common_columns

In [None]:
df

In [None]:
df = df[common_columns]
fma_cjm_echonest = fma_cjm_echonest[common_columns]
# Combine those two dataframes
df = pd.concat([df, fma_cjm_echonest], ignore_index=True)

In [None]:
# Create new column called genre. rock -> when playlist_genre is rock, pop -> when playlist_genre is pop but not indie poptimism, hiphop -> when playlist_subgenre is hip hop or southern hip hop, rap -> when playlist_genre is rap but subgenre is not hip hop, indie when playlist_genre is pop and playlist_subgenre is indie poptimism.

df['genre'] = 'other'
df.loc[df['playlist_genre'] == 'rock', 'genre'] = 'rock'
df.loc[(df['playlist_genre'] == 'pop') &(df['playlist_subgenre'] != 'indie poptimism'), 'genre'] = 'pop'
df.loc[(df['playlist_subgenre'] == 'hip hop') | (df['playlist_subgenre'] == 'southern hip hop'), 'genre'] = 'hip hop'
df.loc[(df['playlist_genre'] == 'rap') & (df['playlist_subgenre'] != 'hip hop'), 'genre'] = 'rap'
df.loc[(df['playlist_genre'] == 'pop') & (df['playlist_subgenre'] == 'indie poptimism'), 'genre'] = 'indie'

In [None]:
# Keep only the rows whose genre is not 'other', and drop the two playlist columns
has_genre = df['genre'] != 'other'
df = df.loc[has_genre].drop(columns=['playlist_genre', 'playlist_subgenre'])

In [None]:
df

In [None]:
# Title-case the genre names, then spell 'Hip Hop' as 'Hip-Hop'
title_cased_genre = df['genre'].str.title()
df['genre'] = title_cased_genre.str.replace('Hip Hop', 'Hip-Hop')

In [None]:
df['genre'].value_counts()

In [None]:
# Model input features: eight audio descriptors plus duration (seconds) and release year.
# key, loudness and mode are not part of this list.
# The cells below use these columns for feature selection, PCA plots and the classifiers.
features = [
    'danceability', 'energy', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
] + ['duration', 'year_released']

In [None]:
df

In [None]:
#check if year released is not null
df['year_released'].isnull().sum()

## Apply SMOTE and preprocessing

In [None]:
# Oversample the Jazz and Metal genres with SMOTE to 1000 samples each
smote = SMOTE(sampling_strategy={'Jazz': 1000, 'Metal': 1000}, random_state=42)
X, y = smote.fit_resample(df[features], df['genre'])

# Rebuild df from the resampled features and labels
df = pd.DataFrame(X, columns=features)
df['genre'] = y

df['genre'].value_counts()

In [None]:
# Score every feature against the genre label with f_classif and keep the best 8
X, y = df[features], df['genre']

selector = SelectKBest(f_classif, k=8)
X_new = selector.fit(X, y).transform(X)

# Names of the columns the selector kept
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
selected_features

In [None]:
# Encode the genre labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['genre'])

# Standardize the feature columns
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

In [None]:
df[selected_features]

### PCA

In [None]:
# Perform PCA
from sklearn.decomposition import PCA

# PCA follows variance, so the selected features are standardized first. df still holds the
# raw values at this point, and on raw values the components would be driven by whichever
# columns have the largest numeric range.
pca_input = StandardScaler().fit_transform(df[selected_features])

pca = PCA(n_components=3)
X = pca.fit_transform(pca_input)


In [None]:
# Plot the data in 3d plot with plotly
import plotly.express as px

fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=df['genre'])
fig.show()



In [None]:
# Check genres distribution
df['genre'].value_counts()

## Random forest

In [None]:
# Build a model to predict genre based on metadata (features)
from sklearn.metrics import accuracy_score
import optuna

X = df[features]
y = df['genre']

# Train the model \using k-fold cross validation (k=10). Try to optimize the model using optuna. Use random forest classifier. Calculate accuracy for each fold. Calculate mean accuracy for all folds. Optimize with Optuna.

def objective(trial):
    """Optuna objective: mean accuracy of a random forest over 10 stratified folds.

    Reads the module-level ``X`` (feature frame) and ``y`` (label Series); both are
    indexed positionally with ``.iloc``.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean validation accuracy over the 10 folds.
    """
    # Imported here because the notebook only imports these two names in a later cell,
    # so running the cells top to bottom would otherwise raise NameError.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold

    # Do stratified k-fold cross validation and take mean accuracy as the objective
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Get the parameters
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        max_leaf_nodes=max_leaf_nodes,
        random_state=42,
    )

    accuracies = []

    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # fit() retrains the forest from scratch on every fold
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)

        accuracies.append(accuracy_score(y_val, y_pred))

    return sum(accuracies) / len(accuracies)


study = optuna.create_study(direction='maximize', study_name='random_forest_metadata_8genres', storage='sqlite:///random_forest_metadata.db', load_if_exists=True)
study.optimize(objective, n_trials=50)


In [None]:
# Check the model on test set
# Train the model using the best parameters
# best_params = study.best_params
# model = RandomForestClassifier(n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], max_leaf_nodes=best_params['max_leaf_nodes'], random_state=42)

# import train test split
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# model.fit(X_train, y_train)
# 
# y_pred = model.predict(X_val)
# accuracy_score(y_val, y_pred)
# # Save model
# 
# joblib.dump(model, 'models/random_forest_metadata_8_genres.joblib')

In [None]:
# Load the model
model = joblib.load('models/random_forest_metadata_8_genres_smote_normalized.joblib')
# Evaluate on test set
y_pred = model.predict(X_test)
# Calculate metrics for imbalanced dataset like f1 score, precision, recall
from sklearn.metrics import classification_report

# classification_report lists the classes in sorted label order, so the display names must
# be sorted as well; df['genre'].unique() is in first-seen order and would mislabel the rows.
report_names = sorted(df['genre'].unique())
print(classification_report(y_test, y_pred, target_names=report_names))
accuracy_score(y_test, y_pred)

In [None]:
# Check feature importance
importances = model.feature_importances_

importances_df = pd.DataFrame({'feature': features, 'importance': importances})
importances_df.sort_values(by='importance', ascending=False)

In [None]:
# Confusion matrix heatmap; rows, columns and tick labels all follow the same genre order
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

genre_order = df['genre'].unique()
cm = confusion_matrix(y_test, y_pred, labels=genre_order)
sns.heatmap(cm, annot=True, xticklabels=genre_order, yticklabels=genre_order)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Normalized confusion matrix
import numpy as np

from matplotlib.colors import LinearSegmentedColormap

colors = ["white", "#455681"]  # White to #455681 gradient
custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", colors)

# genre_names is not defined yet when the notebook runs top to bottom. The confusion matrix
# was built with labels=df['genre'].unique(), so the tick labels use that same order.
genre_names = df['genre'].unique()

# Normalize the confusion matrix: divide every row by its row total
conf_matrix_norm = cm / cm.sum(axis=1)[:, np.newaxis]

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
# Plot the normalized confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_norm, cmap=custom_cmap, annot=True, fmt=".2f", xticklabels=genre_names, yticklabels=genre_names, vmax=1.0)
plt.xlabel("Predicted", fontdict={"fontsize": 12})
plt.ylabel("True", fontdict={"fontsize": 12})
plt.tight_layout()
plt.savefig("confusion_matrix_normalized_metadata.eps", dpi=300)
plt.show()


In [None]:
# Genre name -> integer code; this mapping lists five genres only
genre_dict = {
    name: code
    for code, name in enumerate(['hip hop', 'indie', 'pop', 'rap', 'rock'])
}


In [None]:
# Plot the confusion matrix with genre names on both axes.
# The matrix was built with labels=df['genre'].unique(), so the names must come from that
# same call: y_test.unique() lists the genres in the order they first appear in the test
# split, which does not match the matrix rows and columns.
genre_names = df['genre'].unique()
cm = pd.DataFrame(cm, columns=genre_names, index=genre_names)
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('True')

plt.show()


## XGBoost

In [None]:
# Now train the model with XGBClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight

def objective(trial):
    """Optuna objective: mean accuracy of an XGBClassifier over 10 stratified folds.

    Reads the module-level ``df`` and ``features``. Side effect: on every call the
    ``df[features]`` columns are overwritten in place with their standardized values.

    NOTE(review): the scaler is fitted on all rows before the folds are split, so the
    validation rows take part in the scaling - confirm this is intended.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean validation accuracy over the 10 folds.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Sample the hyper-parameters (dict entries are evaluated top to bottom)
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'gamma': trial.suggest_float('gamma', 0.01, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    # Standardize the feature columns and write them back into the global df
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    X = df[features]

    # Integer-encode the genre labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['genre'])

    # Per-row weights that balance the classes
    sample_weight = compute_sample_weight('balanced', y)

    model = XGBClassifier(random_state=42, **hyperparams)

    fold_accuracies = []
    for fit_idx, eval_idx in splitter.split(X, y):
        model.fit(X.iloc[fit_idx], y[fit_idx], sample_weight=sample_weight[fit_idx])
        fold_predictions = model.predict(X.iloc[eval_idx])
        fold_accuracies.append(accuracy_score(y[eval_idx], fold_predictions))

    return sum(fold_accuracies) / len(fold_accuracies)



In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna
from sklearn.preprocessing import StandardScaler

study = optuna.create_study(direction='maximize', study_name='xgb_metadata_v1_8_genres_smote_normalized', storage='sqlite:///random_forest_metadata.db', load_if_exists=True)
study.optimize(objective, n_trials=50)

In [None]:
df

In [None]:
X = df[features]

In [None]:
X

In [None]:
y

In [None]:
# Train the model using the best parameters
best_params = study.best_params
model = XGBClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    min_child_weight=best_params['min_child_weight'],
    random_state=42,
)

# Encode from df['genre'] rather than from the mutable global y: re-running this cell would
# otherwise encode the already encoded integers and lose the genre names in classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['genre'])

# Stratify so the small validation split keeps the genre proportions, like the earlier split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42, stratify=y)

# NOTE(review): the Optuna objective trains with balanced sample weights, this final fit
# does not - confirm which of the two is intended.
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy_score(y_val, y_pred)



In [None]:
print(list(label_encoder.classes_))
print(list(label_encoder.inverse_transform([0,1,2,3, 4,5,6,7])))

In [None]:
# Save model
import joblib
joblib.dump(model, 'models/xgb_metadata_v1_8_genres_smote_normalized.joblib')

In [None]:
# Import classification report
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# NOTE(review): this loads '..._8_genres_smote.joblib', while the save cell writes
# '..._8_genres_smote_normalized.joblib' - confirm which model should be evaluated.
model = joblib.load('models/xgb_metadata_v1_8_genres_smote.joblib')
# Evaluate on test set
y_pred = model.predict(X_test)
# classes_ holds the genre names in encoded order; no hard-coded list of eight codes needed
class_names = list(label_encoder.classes_)
print(classification_report(y_test, y_pred, target_names=class_names))
accuracy_score(y_test, y_pred)

In [None]:
# Use latex
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

In [None]:
# Plot confusion matrix with genre names.
# classes_ holds the genre names in encoded order, so this works for any number of classes
# instead of a hard-coded list of eight codes.
genre_names = list(label_encoder.classes_)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, xticklabels=genre_names, yticklabels=genre_names)
plt.xlabel('Predicted')
plt.ylabel('True')

plt.show()


In [None]:
# Row-normalized confusion matrix with genre names, saved as an eps figure
from matplotlib.colors import LinearSegmentedColormap

# Two-colour map running from white to #455681
colors = ["#FFFFFF", "#455681"]
custom_cmap = LinearSegmentedColormap.from_list("custom_white_to_blue", colors)

# Divide every row of the confusion matrix by its row total
row_totals = cm.sum(axis=1, keepdims=True)
conf_matrix_norm = cm / row_totals

# Render text with LaTeX in a serif font
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_norm,
    cmap=custom_cmap,
    annot=True,
    fmt=".2f",
    xticklabels=genre_names,
    yticklabels=genre_names,
    vmax=1.0,
)
plt.xlabel("Predicted", fontdict={"fontsize": 12})
plt.ylabel("True", fontdict={"fontsize": 12})
plt.tight_layout()
plt.savefig("confusion_matrix_normalized_metadata.eps", dpi=300)
plt.show()



In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

# Genre names in encoded order, and one indicator column per class for the test labels
classes = list(label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7]))
class_count = len(classes)
y_test_bin = label_binarize(y_test, classes=range(class_count))
y_pred_prob = model.predict_proba(X_test)  # class probabilities for the ROC computation

# One ROC curve and its AUC per class, keyed by class index
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(class_count):
    fpr[class_idx], tpr[class_idx], _ = roc_curve(y_test_bin[:, class_idx], y_pred_prob[:, class_idx])
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Save the three dicts and the class labels as .npy files
np.save("fpr_meta.npy", fpr)
np.save("tpr_meta.npy", tpr)
np.save("roc_auc_meta.npy", roc_auc)
np.save("labels_meta.npy", classes)

# Draw every class curve on one figure
plt.figure(figsize=(8, 6))
for class_idx, class_name in enumerate(classes):
    plt.plot(fpr[class_idx], tpr[class_idx], label=f"Class {class_name} (AUC = {roc_auc[class_idx]:.2f})")

# Diagonal reference line for random guessing
plt.plot([0, 1], [0, 1], 'k--')

plt.title("Multi-Class ROC Curve")
plt.xlabel("False Positive Rate", fontdict={"fontsize": 12})
plt.ylabel("True Positive Rate", fontdict={"fontsize": 12})
plt.legend(loc="lower right", prop={"size": 12})  # Adjust legend location if needed
plt.yticks(fontsize=11)
plt.xticks(fontsize=11)
plt.style.use('fast')
plt.tight_layout()
plt.grid()
plt.savefig("roc_curve_metadata.png", dpi=300)
plt.show()


In [None]:
# PCA for Hip-Hop and Indie
# df['genre'] holds the title-cased labels ('Hip-Hop', 'Indie'); filtering on the lower-case
# spellings matches no rows and leaves PCA with an empty frame.
df_hiphop = df[df['genre'] == 'Hip-Hop']
df_indie = df[df['genre'] == 'Indie']

combined_hiphop_indie = pd.concat([df_hiphop, df_indie], ignore_index=True)

X = combined_hiphop_indie[features]
y = combined_hiphop_indie['genre']

# Perform PCA
pca = PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)

# Plot the first three components in a 3d plot with plotly
fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=combined_hiphop_indie['genre'])
fig.show()



In [None]:
# PCA for Pop and Rock
# df['genre'] holds the title-cased labels ('Pop', 'Rock'); filtering on the lower-case
# spellings matches no rows and leaves PCA with an empty frame.
df_pop = df[df['genre'] == 'Pop']
df_rock = df[df['genre'] == 'Rock']

combined_pop_rock = pd.concat([df_pop, df_rock], ignore_index=True)

X = combined_pop_rock[features]
y = combined_pop_rock['genre']

# Perform PCA. Only three components are plotted, and n_components cannot exceed the
# number of input columns (the features list has 10 entries, so 12 raises ValueError).
pca = PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)

# Plot the first three components in a 3d plot with plotly
fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=combined_pop_rock['genre'])
fig.show()


In [None]:
# Print classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=genre_names))

In [None]:
# Compare y_test and y_pred and list the test rows where Hip-Hop is predicted as Indie.
# The genres are compared by name through genre_names (class names in encoded order) rather
# than by the hard-coded codes 0 and 3, which depend on how many classes were encoded.
# zip/enumerate walk the rows by position, so this works whether y_test is an array or a
# Series with a non-contiguous index.
for i, (true_code, predicted_code) in enumerate(zip(y_test, y_pred)):
    true_genre = genre_names[true_code]
    predicted_genre = genre_names[predicted_code]
    if true_genre == 'Hip-Hop' and predicted_genre == 'Indie':
        print(f'y_test: {true_code}, y_pred: {predicted_code}')
        print(f'index: {i}')
        print(f'features: {X_test.iloc[i]}')
        print(f'genre: {true_genre}')
        print(f'predicted genre: {predicted_genre}')
        print('---')



In [None]:
# Add y pred and y test to X_test
X_test['y_test'] = y_test
X_test['y_pred'] = y_pred

In [None]:
X_test

In [None]:
# Get subset of df where rows are in X_test without ordering by index
is_in_X = df.index.isin(X_test.index)

In [None]:
# Get the rows from df where index is in X_test without ordering by index
df_test = df[is_in_X]

In [None]:
df_test

In [None]:
# Combine X_test and df_test
df_test = pd.concat([X_test, df_test], axis=1)

In [None]:
# Print unique genres and unique y_test values for the genre
for genre in genre_names:
    print(f'Genre: {genre}')
    print(f'Unique y_test values: {df_test[df_test["genre"] == genre]["y_test"].unique()}')
    print('---')