In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb


In [2]:
def create_arbitrary_groups(genre_list, num_groups=19):
    sorted_genres = sorted(list(set(genre_list)))
    genre_chunks = np.array_split(sorted_genres, num_groups)
    grouped_genre_dict = {}
    for i, chunk in enumerate(genre_chunks):
        group_name = f"Arbitrary_Group_{i + 1:02d}"
        grouped_genre_dict[group_name] = chunk.tolist()
    return grouped_genre_dict


In [3]:
data = pd.read_csv('/kaggle/input/-spotify-tracks-dataset/dataset.csv')
data = data.drop(labels=['Unnamed: 0', 'track_id', 'album_name', 'track_name'], axis=1)
data = data.dropna()

In [4]:
data.describe()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0,113999.0
mean,33.238827,228031.2,0.566801,0.641383,5.309126,-8.25895,0.637558,0.084652,0.314907,0.156051,0.213554,0.474066,122.147695,3.904034
std,22.304959,107296.1,0.173543,0.25153,3.559999,5.029357,0.480708,0.105733,0.332522,0.309556,0.190378,0.259261,29.97829,0.432623
min,0.0,8586.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.2185,4.0
50%,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.5975,0.049,0.273,0.683,140.071,4.0
max,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  object 
 1   popularity        113999 non-null  int64  
 2   duration_ms       113999 non-null  int64  
 3   explicit          113999 non-null  bool   
 4   danceability      113999 non-null  float64
 5   energy            113999 non-null  float64
 6   key               113999 non-null  int64  
 7   loudness          113999 non-null  float64
 8   mode              113999 non-null  int64  
 9   speechiness       113999 non-null  float64
 10  acousticness      113999 non-null  float64
 11  instrumentalness  113999 non-null  float64
 12  liveness          113999 non-null  float64
 13  valence           113999 non-null  float64
 14  tempo             113999 non-null  float64
 15  time_signature    113999 non-null  int64  
 16  track_genre       113999 

In [6]:
all_unique_genres = data['track_genre'].unique()

genre_map_19 = create_arbitrary_groups(all_unique_genres, num_groups=19)

In [7]:
subgenre_to_parent_map = {}
for parent_group, subgenres in genre_map_19.items():
    for genre in subgenres:
        subgenre_to_parent_map[genre] = parent_group

# 6. --- Create the new column ---
data['arbitrary_group'] = data['track_genre'].map(subgenre_to_parent_map)

print("\n--- DataFrame with new 'arbitrary_group' column ---")
print(data[['track_genre', 'arbitrary_group']].head())


--- DataFrame with new 'arbitrary_group' column ---
  track_genre     arbitrary_group
0    acoustic  Arbitrary_Group_01
1    acoustic  Arbitrary_Group_01
2    acoustic  Arbitrary_Group_01
3    acoustic  Arbitrary_Group_01
4    acoustic  Arbitrary_Group_01


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  object 
 1   popularity        113999 non-null  int64  
 2   duration_ms       113999 non-null  int64  
 3   explicit          113999 non-null  bool   
 4   danceability      113999 non-null  float64
 5   energy            113999 non-null  float64
 6   key               113999 non-null  int64  
 7   loudness          113999 non-null  float64
 8   mode              113999 non-null  int64  
 9   speechiness       113999 non-null  float64
 10  acousticness      113999 non-null  float64
 11  instrumentalness  113999 non-null  float64
 12  liveness          113999 non-null  float64
 13  valence           113999 non-null  float64
 14  tempo             113999 non-null  float64
 15  time_signature    113999 non-null  int64  
 16  track_genre       113999 

In [9]:
data['arbitrary_group'] = pd.factorize(data['arbitrary_group'])[0]

In [10]:
data['artists'] = pd.factorize(data['artists'])[0]

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  int64  
 1   popularity        113999 non-null  int64  
 2   duration_ms       113999 non-null  int64  
 3   explicit          113999 non-null  bool   
 4   danceability      113999 non-null  float64
 5   energy            113999 non-null  float64
 6   key               113999 non-null  int64  
 7   loudness          113999 non-null  float64
 8   mode              113999 non-null  int64  
 9   speechiness       113999 non-null  float64
 10  acousticness      113999 non-null  float64
 11  instrumentalness  113999 non-null  float64
 12  liveness          113999 non-null  float64
 13  valence           113999 non-null  float64
 14  tempo             113999 non-null  float64
 15  time_signature    113999 non-null  int64  
 16  track_genre       113999 

In [12]:
X_train = []
Y_train = []
X_train_groups = []
Y_train_groups = []
X_test = []
Y_test = []

In [13]:
for i in data['arbitrary_group'].unique():
    data_dum = data[data['arbitrary_group'] == i]
    data_dum['track_genre'] = pd.factorize(data_dum['track_genre'])[0]
    X = np.array(data_dum.drop(['track_genre', 'arbitrary_group'], axis=1))
    Y = np.array(data_dum[['track_genre', 'arbitrary_group']])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    X_train_groups.append(x_train)
    Y_train_groups.append(y_train)
    X_train.extend(x_train)
    Y_train.extend(y_train)
    X_test.extend(x_test)
    Y_test.extend(y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dum['track_genre'] = pd.factorize(data_dum['track_genre'])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dum['track_genre'] = pd.factorize(data_dum['track_genre'])[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dum['track_genre'] = pd.factorize(data_dum['track_genre'])[0]
A va

In [14]:
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [15]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(91199, 16) (91199, 2)
(22800, 16) (22800, 2)


In [16]:
print(Y_train[:, 1])

[ 0  0  0 ... 18 18 18]


In [17]:
xgmodel = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=7, gamma=10, random_state=42,reg_alpha=0.2,reg_lambda=0.5,nthread=-1)
xgmodel.fit(X_train, Y_train[:, 1])

In [18]:
print(classification_report(Y_test[:, 1], xgmodel.predict(X_test)))

              precision    recall  f1-score   support

           0       0.61      0.94      0.74      1200
           1       0.69      0.91      0.78      1200
           2       0.83      0.98      0.90      1200
           3       0.65      0.95      0.77      1200
           4       0.87      0.93      0.90      1200
           5       0.58      0.79      0.67      1200
           6       0.75      0.78      0.77      1200
           7       0.73      0.76      0.74      1200
           8       0.85      0.76      0.80      1200
           9       0.86      0.77      0.82      1200
          10       0.84      0.80      0.82      1200
          11       0.73      0.75      0.74      1200
          12       0.82      0.76      0.79      1200
          13       0.91      0.69      0.79      1200
          14       0.72      0.40      0.52      1200
          15       0.90      0.73      0.81      1200
          16       0.87      0.69      0.77      1200
          17       0.97    

In [19]:
models = {}

In [20]:
for i in range(0, 19):
    model = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=7, gamma=10, random_state=42)
    model.fit(X_train_groups[i], Y_train_groups[i][:, 0])
    models[i] = model

In [21]:
preds = []

In [22]:
for i in X_test:
    p1 = xgmodel.predict(i.reshape(1, -1))
    p2 = models[p1[0]].predict(i.reshape(1, -1))
    preds.append([p2[0], p1[0]])

In [23]:
preds = np.array(preds)

In [24]:
compa = (preds == Y_test)

In [25]:
new_preds = np.array(np.all(compa, axis=1))
print(new_preds)

[ True  True  True ...  True  True False]


In [26]:
new_preds = np.where(new_preds == True)
print(new_preds)

(array([    0,     1,     2, ..., 22796, 22797, 22798]),)


In [27]:
print(len(new_preds[0]))

16659


In [28]:
print(len(new_preds[0]))
print(len(Y_test))
print(len(new_preds[0]) / len(Y_test))

16659
22800
0.7306578947368421
