<a href="https://colab.research.google.com/github/JacksonHassell/DSP-Project/blob/main/DSP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [71]:
df = pd.read_csv('./Spotify-2000.csv', thousands=',')
df.head

<bound method NDFrame.head of       Index                   Title                    Artist  \
0         1                 Sunrise               Norah Jones   
1         2             Black Night               Deep Purple   
2         3          Clint Eastwood                  Gorillaz   
3         4           The Pretender              Foo Fighters   
4         5  Waitin' On A Sunny Day         Bruce Springsteen   
...     ...                     ...                       ...   
1989   1990        Heartbreak Hotel             Elvis Presley   
1990   1991               Hound Dog             Elvis Presley   
1991   1992         Johnny B. Goode               Chuck Berry   
1992   1993               Take Five  The Dave Brubeck Quartet   
1993   1994          Blueberry Hill               Fats Domino   

                Top Genre  Year  Beats Per Minute (BPM)  Energy  Danceability  \
0         adult standards  2004                     157      30            53   
1              album rock  

In [72]:
print(len(df['Artist'].value_counts()))

731


In [73]:
#Drop Index because we don't need two indices. 
#Drop title because each title is different and it adds no information.
#Drop artist because there are 700 different artists and one-hot-encoding each one will add a bunch of noise to our models
df.drop(['Index', 'Title', 'Artist'], axis=1, inplace=True)
df.head

<bound method NDFrame.head of                 Top Genre  Year  Beats Per Minute (BPM)  Energy  Danceability  \
0         adult standards  2004                     157      30            53   
1              album rock  2000                     135      79            50   
2     alternative hip hop  2001                     168      69            66   
3       alternative metal  2007                     173      96            43   
4            classic rock  2002                     106      82            58   
...                   ...   ...                     ...     ...           ...   
1989      adult standards  1958                      94      21            70   
1990      adult standards  1958                     175      76            36   
1991           blues rock  1959                     168      80            53   
1992                bebop  1959                     174      26            45   
1993      adult standards  1959                     133      50            49  

In [74]:
# Standardize numerical values
def standardize_column(column):
  return (column-column.min())/(column.max()-column.min())

df_encoded = df.copy()
columns_to_standardize = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)', 'Liveness', 'Valence', 'Length (Duration)', 'Acousticness', 'Speechiness', 'Popularity', 'Year']
for col in columns_to_standardize:
  df_encoded[col] = standardize_column(df_encoded[col])

df_encoded = df_encoded.join(pd.get_dummies(df_encoded['Top Genre']))
df_encoded = df_encoded.drop('Top Genre', axis=1)
df_encoded.head

<bound method NDFrame.head of           Year  Beats Per Minute (BPM)    Energy  Danceability  Loudness (dB)  \
0     0.761905                0.710059  0.278351      0.500000           0.52   
1     0.698413                0.579882  0.783505      0.465116           0.64   
2     0.714286                0.775148  0.680412      0.651163           0.72   
3     0.809524                0.804734  0.958763      0.383721           0.92   
4     0.730159                0.408284  0.814433      0.558140           0.88   
...        ...                     ...       ...           ...            ...   
1989  0.031746                0.337278  0.185567      0.697674           0.60   
1990  0.031746                0.816568  0.752577      0.302326           0.76   
1991  0.047619                0.775148  0.793814      0.500000           0.72   
1992  0.047619                0.810651  0.237113      0.406977           0.56   
1993  0.047619                0.568047  0.484536      0.453488           0.68  

Try Clustering

Find best k

In [89]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

n_clusters = 5
best_score = 0
for i in range(5, 20, 1):
  model = KMeans(n_clusters = i).fit(df_encoded)
  score = silhouette_score(df_encoded, model.labels_, metric='euclidean')
  print('Score: ' + str(score) + ', ' + str(i))
  if score > best_score:
    n_clusters = i
    best_score = score
print(n_clusters)

Score: 0.18141826603463676, 5
Score: 0.18778497350654605, 6
Score: 0.19981104861809548, 7
Score: 0.21284898155330917, 8
Score: 0.2351537604129091, 9
Score: 0.17265728789290646, 10
Score: 0.25692420717642694, 11
Score: 0.27461385977534647, 12
Score: 0.2876587266588614, 13
Score: 0.29873232503672653, 14
Score: 0.3027583541734753, 15
Score: 0.25228720801928595, 16
Score: 0.25240650844861745, 17
Score: 0.3370892588219846, 18
Score: 0.32559083233448166, 19
18


In [93]:
model = KMeans(n_clusters = n_clusters).fit(df_encoded)

def std(x): return np.std(x)
def var(x): return np.var(x)

df_display = df_encoded.copy()
df_display['Cluster'] = model.labels_
df_display['Genre'] = df['Top Genre']
df_display['Year'] = df['Year']
print(df_display.groupby(by='Cluster').agg(['mean', 'max', std, var]))

                Year                              Beats Per Minute (BPM)  \
                mean   max        std         var                   mean   
Cluster                                                                    
0        1991.634615  2019  10.205634  104.154956               0.520141   
1        1980.631206  2019  11.798793  139.211508               0.505581   
2        1994.970732  2019  16.292722  265.452802               0.464367   
3        1996.558140  2019   8.393803   70.455922               0.484244   
4        1999.800000  2019   8.058004   64.931429               0.517413   
5        2007.079545  2019  10.366271  107.459582               0.532477   
6        1972.083333  2014  13.141062  172.687500               0.486522   
7        1979.016260  2017  15.350636  235.642012               0.473998   
8        2008.367347  2018   6.209584   38.558934               0.550175   
9        2006.307116  2019   9.208532   84.797065               0.515502   
10       200