In [48]:
import pandas as pd
%config IPCompleter.greedy=True
pd.set_option('display.max_columns', None)

In [35]:
df = pd.read_csv('./Spotify-2000.csv', thousands=',')
df.head

<bound method NDFrame.head of       Index                   Title  ... Speechiness Popularity
0         1                 Sunrise  ...           3         71
1         2             Black Night  ...           7         39
2         3          Clint Eastwood  ...          17         69
3         4           The Pretender  ...           4         76
4         5  Waitin' On A Sunny Day  ...           3         59
...     ...                     ...  ...         ...        ...
1989   1990        Heartbreak Hotel  ...           7         63
1990   1991               Hound Dog  ...           6         69
1991   1992         Johnny B. Goode  ...           7         74
1992   1993               Take Five  ...           4         65
1993   1994          Blueberry Hill  ...           3         56

[1994 rows x 15 columns]>

In [36]:
print(len(df['Artist'].value_counts()))

731


In [37]:
#Drop Index because we don't need two indices. 
#Drop title because each title is different and it adds no information.
#Drop artist because there are 700 different artists and one-hot-encoding each one will add a bunch of noise to our models
df.drop(['Index', 'Title', 'Artist'], axis=1, inplace=True)
df.head

<bound method NDFrame.head of                 Top Genre  Year  ...  Speechiness  Popularity
0         adult standards  2004  ...            3          71
1              album rock  2000  ...            7          39
2     alternative hip hop  2001  ...           17          69
3       alternative metal  2007  ...            4          76
4            classic rock  2002  ...            3          59
...                   ...   ...  ...          ...         ...
1989      adult standards  1958  ...            7          63
1990      adult standards  1958  ...            6          69
1991           blues rock  1959  ...            7          74
1992                bebop  1959  ...            4          65
1993      adult standards  1959  ...            3          56

[1994 rows x 12 columns]>

In [40]:
# Standardize numerical values
def standardize_column(column):
  return (column-column.min())/(column.max()-column.min())

df_encoded = df.copy()
columns_to_standardize = ['Beats Per Minute (BPM)', 'Energy', 'Danceability', 'Loudness (dB)', 'Liveness', 'Valence', 'Length (Duration)', 'Acousticness', 'Speechiness', 'Popularity', 'Year']
for col in columns_to_standardize:
  df_encoded[col] = standardize_column(df_encoded[col])

df_encoded = df_encoded.join(pd.get_dummies(df_encoded['Top Genre']))
df_encoded = df_encoded.drop('Top Genre', axis=1)
df_encoded.head

<bound method NDFrame.head of           Year  Beats Per Minute (BPM)    Energy  ...  trance  uk pop  yacht rock
0     0.761905                0.710059  0.278351  ...       0       0           0
1     0.698413                0.579882  0.783505  ...       0       0           0
2     0.714286                0.775148  0.680412  ...       0       0           0
3     0.809524                0.804734  0.958763  ...       0       0           0
4     0.730159                0.408284  0.814433  ...       0       0           0
...        ...                     ...       ...  ...     ...     ...         ...
1989  0.031746                0.337278  0.185567  ...       0       0           0
1990  0.031746                0.816568  0.752577  ...       0       0           0
1991  0.047619                0.775148  0.793814  ...       0       0           0
1992  0.047619                0.810651  0.237113  ...       0       0           0
1993  0.047619                0.568047  0.484536  ...       0       

Try Clustering

In [50]:
from sklearn.cluster import KMeans

model = KMeans().fit(df_encoded)

df_display = df_encoded.copy()
df_display['Cluster'] = model.labels_
df_display['Year'] = df['Year']
print(df_display.groupby(by='Cluster').mean())

                Year  Beats Per Minute (BPM)    Energy  Danceability  \
Cluster                                                                
0        1980.883777                0.502543  0.582986      0.481530   
1        1989.486352                0.490889  0.687933      0.627878   
2        1994.711590                0.450502  0.349765      0.450793   
3        2007.079545                0.532477  0.557170      0.478462   
4        1996.558140                0.484244  0.682570      0.448080   
5        2004.772834                0.520419  0.718414      0.463319   
6        1979.016260                0.473998  0.408851      0.455379   
7        2004.072289                0.485635  0.633586      0.588960   

         Loudness (dB)  Liveness   Valence  Length (Duration)  Acousticness  \
Cluster                                                                       
0             0.679419  0.164175  0.517933           0.153807      0.271186   
1             0.744020  0.197513  0.732604