In [446]:
import random
random.seed(12556949)

In [447]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import silhouette_samples, silhouette_score, euclidean_distances
from sklearn import cluster
from sklearn.metrics import roc_auc_score
import torch
from torch import nn
from IPython.display import clear_output

In [448]:
df = pd.read_csv('musicData.csv')
df.drop(['instance_id', 'artist_name', 'track_name', 'obtained_date'], axis = 1, inplace=True)
df.replace('?', np.nan, inplace=True)

In [449]:
# clean the data in column 'key'
df['key'] = pd.factorize(df['key'])[0]
df['key'].replace(-1, np.nan, inplace=True)
key_mean = df['key'].mean()
df['key'].fillna(key_mean, inplace=True)

In [450]:
# turn column 'mode' to 1 or 0
df['mode'].replace('Major', 1, inplace=True)
df['mode'].replace('Minor', 0, inplace=True)
df.dropna(subset=['mode'], inplace=True)

In [451]:
df.dropna(subset=['music_genre'], inplace=True)

In [452]:
df.replace(-1,np.nan, inplace=True)

In [453]:
df['duration_ms'].fillna(df['duration_ms'].mean(skipna=True), inplace=True)
df['tempo'].fillna(df['tempo'].astype(float).mean(skipna=True), inplace=True)
df['music_genre'] = pd.factorize(df['music_genre'])[0]

In [454]:
test_size = 500
genres = df['music_genre'].unique()

train_data = []
test_data = []
for genre in genres:
    # Split data for this genre
    genre_data = df[df['music_genre'] == genre].copy()
    train, test = train_test_split(genre_data, test_size=test_size)
    train_data.append(train)
    test_data.append(test)

# Combine train and test data for each genre
train_data = pd.concat(train_data)
test_data = pd.concat(test_data)

# Separate the target variable from the features
X_train = train_data.drop('music_genre', axis=1)
y_train = train_data['music_genre']
X_test = test_data.drop('music_genre', axis=1)
y_test = test_data['music_genre']


In [455]:
X_train

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
1474,41.0,0.076400,0.421,262533.000000,0.944,0.815000,2.0,0.1190,-9.179,1.0,0.3020,162.179,0.222
3758,38.0,0.013800,0.628,368277.000000,0.719,0.095100,2.0,0.2060,-5.443,1.0,0.0410,119.952961,0.371
1056,51.0,0.004920,0.888,355440.000000,0.780,0.356000,0.0,0.0675,-8.711,0.0,0.0797,119.952961,0.746
3822,45.0,0.000672,0.672,265250.000000,0.897,0.000618,4.0,0.0707,-4.514,1.0,0.4160,140.026,0.360
1622,29.0,0.001910,0.334,174545.000000,0.803,0.005800,0.0,0.1150,-6.126,0.0,0.1160,78.711,0.132
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46608,50.0,0.012100,0.734,155063.000000,0.626,0.000043,7.0,0.3730,-4.550,1.0,0.2560,149.977,0.542
45629,50.0,0.178000,0.714,245503.541466,0.709,0.000000,9.0,0.0375,-5.834,1.0,0.1150,132.843,0.933
48096,60.0,0.072200,0.851,193907.000000,0.405,0.000001,3.0,0.1080,-9.775,1.0,0.2050,149.98,0.292
47946,50.0,0.042300,0.881,237733.000000,0.889,0.000001,5.0,0.1080,-5.401,1.0,0.0944,106.054,0.492


In [456]:
# Dimensionality reduction and clustering
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Determine number of components for PCA
pca = PCA()
pca.fit(X_train)
sum(pca.explained_variance_ >1)


5

In [457]:
pca = PCA(n_components=5)
X_pca_train = pca.fit_transform(X_train_scaled)
X_pca_test = pca.fit_transform(X_test_scaled)


In [458]:
kmeans = KMeans(n_clusters=7, random_state = 20)
X_clustered_train = kmeans.fit_transform(X_pca_train)
X_clustered_test = kmeans.fit_transform(X_pca_test)
y_pred = kmeans.predict(X_pca_test)



In [459]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_clustered_train, y_train)
y_prob = dtc.predict_proba(X_clustered_test)
auc = roc_auc_score(y_test, y_prob, multi_class='ovo')
print('AUC:', auc)

AUC: 0.5157665333333332


In [483]:
from imblearn.over_sampling import RandomOverSampler
learning_rate = 1e-1
lambda_l2 = 1e-3
D = X_clustered_train.shape[1]
H = 100
C = len(np.unique(y_train))



# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_clustered_train.astype(np.float32))
X_val_tensor = torch.tensor(X_clustered_test.astype(np.float32))
Y_train_tensor = torch.tensor(y_train.values).long()
Y_val_tensor = torch.tensor(y_test.values).long()

model = nn.Sequential(
    nn.Linear(D, H),
    nn.Linear(H, H),
    nn.Linear(H, C)
)
# nn package has a variety of loss functions already implemented
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# nn package also has a variety of optimization algorithms implemented
# we use the stochastic gradient descent for our parameter updates
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2

# Training loop
for t in range(1000):
    
    # Forward pass over the model to get the logits 
    y_pred = model(X_train_tensor)
    
    # Compute the loss and accuracy
    loss = criterion(y_pred, Y_train_tensor)
    score, predicted = torch.max(y_pred, 1)
    acc = (Y_train_tensor == predicted).sum().float() / len(Y_train_tensor)
    print("[EPOCH]: %i, [LOSS]: %.6f, [ACCURACY]: %.3f" % (t, loss.item(), acc))
    clear_output(wait=True)
    
    # reset (zero) the gradients before running the backward pass over the model
    # we need to do this because the gradients get accumulated at the same place across iterations
    optimizer.zero_grad()
    
    # Backward pass to compute the gradient of loss w.r.t our learnable params (weights and biases)
    loss.backward()
    
    # Update params
    optimizer.step()


# Assuming you have trained a feedforward neural network and obtained predictions
y_pred = model(X_val_tensor)  # Replace with your actual predictions


[EPOCH]: 999, [LOSS]: 1.684719, [ACCURACY]: 0.355


In [492]:
import torch.nn.functional as F

# Assuming y_pred is a tensor of predicted class probabilities
y_pred_softmax = F.softmax(y_pred, dim=1)
auc = roc_auc_score(y_test, y_pred_softmax.detach().numpy(), multi_class='ovr')

print("AUC: ", auc)

AUC:  0.5260558888888889


In [489]:
y_pred.detach()

tensor([[ 3.2494, -3.3737,  3.0332,  ..., -1.0874, -2.6503,  0.1268],
        [ 2.8062, -4.3979,  2.7067,  ..., -0.5736, -3.0037,  1.3984],
        [ 3.3601, -2.7442,  2.0167,  ..., -0.3300, -1.7702, -0.2195],
        ...,
        [ 3.5846, -2.2582,  2.9637,  ..., -0.9023, -2.0093, -1.3161],
        [ 4.0706, -1.7015,  2.0832,  ..., -0.2830, -1.6218, -1.4422],
        [ 3.7817, -2.2943,  2.3530,  ..., -0.6404, -2.0219, -0.8955]])

In [360]:
X_train_tensor

tensor([[-1.0937,  1.5280, -0.1485, -0.5701,  0.0079],
        [-1.7533,  0.7126, -0.4108, -0.8609, -0.0078],
        [ 0.3162,  1.4926,  0.7988,  1.4173,  1.8038],
        ...,
        [-1.0355, -1.6388,  0.5022,  0.2238,  1.1933],
        [-2.2131, -2.3033, -0.3562,  1.3131,  0.2326],
        [-0.7043, -1.1150,  0.4368,  1.6296, -0.1339]])

In [355]:
Y_val_tensor.shape

torch.Size([5000])

In [356]:
X_val_tensor.shape

torch.Size([5000, 5])

In [359]:
Y_train_tensor.shape

torch.Size([45000])