In [None]:
#Import libraries
import numpy as np

import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [None]:
# Load the song dataset from data1.csv (path is relative to the working directory).
# rdata = raw data; later cells derive new frames from it rather than modifying it.
rdata = pd.read_csv('data1.csv')

# Display the data - raw and unfiltered
rdata

In [None]:
# ddata = "dropped data": the raw frame without the columns this analysis never uses.
unused_columns = ['duration_ms', 'explicit', 'id', 'release_date']
ddata = rdata.drop(columns=unused_columns)

# da = "DataFrame artists": the artists column with leading/trailing [ ] " '
# characters removed. str.strip only trims the two ends of each string, so any
# quotes or separators in the middle of an entry are left untouched.
# .str.strip returns a Series; .to_frame() turns it into a one-column DataFrame.
artists_clean = ddata['artists'].str.strip('["]\'')
da = artists_clean.to_frame()
da

In [None]:
# df = the working DataFrame used from here on: ddata with its artists column
# replaced by the cleaned values held in da (rows are matched by index).
cleaned_artists = da['artists']
df = ddata.assign(artists=cleaned_artists)
df

In [None]:
# Print the dtype and non-null count of every column in df
df.info()

In [None]:
# Display summary statistics of the data.
# The count row is the same in every column shown, which suggests no null entries there.
# NOTE(review): with default arguments describe() summarises only the numeric columns
# of a mixed-dtype frame, so this does not cover name/artists - confirm with df.isna().sum().
df.describe()

In [None]:
# dfw = df "without" the two text columns (name, artists); this is the frame
# the clustering cells fit KMeans on.
dfw = df.drop(['name', 'artists'], axis='columns')

In [None]:
# rsong = 7 randomly sampled rows of df (no seed is given, so every run draws different songs)
# fsong = the audio-feature columns of those sampled songs
sample_feature_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                       'loudness', 'speechiness', 'tempo', 'valence']
rsong = df.sample(n=7)
fsong = rsong[sample_feature_cols]
fsong

In [None]:
# features = the eight audio-feature columns of dfw
features = dfw.loc[:, ['acousticness', 'danceability', 'energy', 'instrumentalness',
                       'loudness', 'speechiness', 'tempo', 'valence']]

In [None]:
# Look up a specific song by exact song name and artist.
# A search can return several rows because of re-recordings in different years
# (for example name = Virgen, artists = Adolescent's Orquesta).
title_matches = df['name'] == "Do I Wanna Know?"
artist_matches = df['artists'] == 'Arctic Monkeys'
song = df[title_matches & artist_matches]

# songf = the matched rows reduced to artist, name and the audio features
songf = song[['artists', 'name', 'acousticness', 'danceability', 'energy',
              'instrumentalness', 'loudness', 'speechiness', 'tempo', 'valence']]
songf

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia_
# so the number of clusters can be read off the curve in the next cell.
# random_state is pinned so the centroid initialisation - and therefore the
# curve - is the same on every Restart & Run All.
# NOTE(review): the model is fitted on every column of dfw, not only the audio
# features selected in `features`, and the columns are not rescaled first -
# confirm this is intended.
K = range(1, 10)
distortions = []
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=42)
    kmeanModel.fit(dfw)
    distortions.append(kmeanModel.inertia_)

In [None]:
# Plot the recorded inertia against k; the bend ("elbow") of the curve marks
# the number of clusters to use.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, distortions)
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('Optimal k')
plt.show()
# Elbow read off the plot: k = 4

In [None]:
# Final model: 4 clusters, the k chosen from the elbow plot.
# random_state is pinned because an unseeded KMeans can number its clusters
# differently on every run, while later cells hard-code cluster 0 (c0f) and
# row label 26433; without a seed that lookup can fail with a KeyError.
# NOTE(review): after pinning the seed, confirm row 26433 is in cluster 0.
km = KMeans(n_clusters=4, random_state=42)
km.fit(dfw)
cluster = km.labels_  # one cluster id per row of dfw, in row order

In [None]:
# newdf = dfw plus a Cluster column holding each row's cluster id.
newdf = dfw.assign(Cluster=cluster)
grouped = newdf.groupby(["Cluster"])

# Per-cluster mean of each audio feature (displayed as the cell output).
mean_per_feature = dict.fromkeys(
    ['acousticness', 'danceability', 'energy', 'instrumentalness',
     'loudness', 'speechiness', 'tempo', 'valence'],
    'mean',
)
grouped.agg(mean_per_feature)

In [None]:
# Display the clustered frame (dfw plus the Cluster column)
newdf

In [None]:
# newdf_melt = long-format copy of the clustered data: one row per
# (song, metric) pair, keeping the cluster id alongside each value.
metric_columns = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                  'loudness', 'speechiness', 'tempo', 'valence']
newdf_melt = newdf.reset_index().melt(
    id_vars=['Cluster'],
    value_vars=metric_columns,
    var_name='Metric',
    value_name='Value',
)

In [None]:
# result = the clustered DataFrame ordered by the year column (ascending)
result = newdf.sort_values('year', ascending=True)
result

In [None]:
# Split the clustered frame into one DataFrame per cluster id (0-3).
# The rows keep their original index labels on purpose: later cells re-attach
# the artists and name columns by index alignment. (A bare .reindex() with no
# arguments changes nothing, so it is not called here.)
c0, c1, c2, c3 = (newdf[newdf['Cluster'] == label] for label in range(4))

In [None]:
# Display the rows assigned to cluster 0
c0

In [None]:
# Add the artists and song name back onto each cluster frame (as the last
# columns). Values are matched to rows by index label.
# c#a = cluster # with artists
# c#f = cluster # with the full data (artists and name)
c0a, c1a, c2a, c3a = (
    part.assign(artists=da['artists']) for part in (c0, c1, c2, c3)
)
c0f, c1f, c2f, c3f = (
    part.assign(name=df['name']) for part in (c0a, c1a, c2a, c3a)
)

In [None]:
# Reorder the columns of every cluster frame so name and artists come first
# and Cluster comes last.
neworder = [
    'name', 'artists', 'year',
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'popularity',
    'speechiness', 'tempo', 'valence',
    'Cluster',
]
c0f, c1f, c2f, c3f = (
    frame.reindex(columns=neworder) for frame in (c0f, c1f, c2f, c3f)
)

In [None]:
# Look at the rebuilt DataFrame: the last rows of cluster 0 after sorting by year
c0f.sort_values('year').tail()

In [None]:
# Fetch one song from cluster 0 by its index label and keep that label for later lookups.
# songbi = "song by index": the row as a Series
# sname  = the row's index label - Series.name of a row selected with .loc is the
#          label it was selected by (26433 here), not the value of the 'name' column
# NOTE(review): the label 26433 and the cluster frame c0f are hard-coded; this raises
# KeyError if that row is not assigned to cluster 0 in the current run.
songbi = c0f.loc[26433]
sname = songbi.name
sname

In [None]:
# ssong = searched song: the same row fetched from the pre-clustering DataFrame.
# sname is an index label, so it is looked up with label-based .loc. Positional
# .iloc only returns the same row while df keeps its default 0..n-1 index and
# would silently pick a different song after any filtering or re-sorting.
ssong = df.loc[sname]
ssong

In [None]:
# Pick up to 10 random songs from cluster 0 (the frame the searched song was
# taken from) whose year is within 5 years of the searched song, inclusive.
# The sample size is capped at the number of matching rows, because asking
# sample() for more rows than exist raises ValueError.
nearby = c0f[c0f['year'].between(ssong.year - 5, ssong.year + 5)]
nearby.sample(n=min(10, len(nearby)))

In [None]:
# Get the index labels of a song by its name, searching the DataFrame from
# before clustering, then display the matching rows.
# songn / year = name and year of the previously searched song (ssong); the
# lookup below uses a fixed title and does not filter on either of them.
songn = ssong['name']
year = ssong['year']
title_mask = df['name'] == 'Shake It Off'
idx = df.loc[title_mask].index
df.loc[idx]

In [None]:
# allsongs = the four per-cluster DataFrames stacked back into a single frame
# (rows keep their original index labels, grouped cluster by cluster).
fourframes = [c0f, c1f, c2f, c3f]
allsongs = pd.concat(objs=fourframes)
allsongs

In [None]:
# Sanity check: look up the searched song's index label (sname) in the combined
# frame to see that the row matches the one found earlier.
allsongs.loc[sname]

In [None]:
# fullrdata = the raw data plus a `cluster` column; cluster ids are matched to
# the raw rows by index label.
cluster_labels = allsongs['Cluster']
fullrdata = rdata.assign(cluster=cluster_labels)
fullrdata