In [1]:

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})


In [2]:

# Load the per-song feature table from disk.
data = pd.read_csv('Song_features.csv')

# Every column from 'tempo' to the last one is used as the feature matrix;
# columns to the left of 'tempo' (e.g. song_name) are left out.
x = data.loc[:, 'tempo':].to_numpy()

data.head()


Unnamed: 0,song_name,tempo,duration,rms_mean,rms_var,zcr_mean,zcr_var,cent_mean,cent_var,spec_bw_mean,...,perc_var03,perc_var04,perc_var05,perc_var06,perc_var07,perc_var08,perc_var09,perc_var10,perc_var11,perc_var12
0,'Cos I'm your lady.mp3,135.999178,268.3,0.076404,0.00347,0.092907,0.002327,2150.76694,419685.9885,2366.146405,...,0.079882,0.088056,0.085021,0.102389,0.08288,0.070191,0.090469,0.098475,0.099569,0.081991
1,-Songs-of-War-The-North-Remembers-Game-of-Thro...,99.384014,334.0,0.068129,0.004931,0.047366,0.001988,1437.034965,991637.3602,1984.39906,...,0.072901,0.071644,0.086667,0.076782,0.090326,0.081059,0.057149,0.081281,0.065953,0.079924
2,004 Enna Sona.mp3,99.384014,213.6,0.172036,0.006449,0.059317,0.004618,1625.98124,995284.7308,2190.861978,...,0.104028,0.087329,0.081556,0.105668,0.083438,0.101539,0.088441,0.099496,0.100903,0.071239
3,005 SWIKAR KIYA MAINE = CHAND KE PAAS JO.mp3,135.999178,276.1,0.127634,0.005975,0.154992,0.002889,2378.462399,299236.1006,1970.625599,...,0.094054,0.107579,0.098333,0.089817,0.080236,0.083156,0.067928,0.095123,0.094297,0.099293
4,007. Kabhi To Nazar Milao = Kabhi To Nazar Mil...,99.384014,466.8,0.077911,0.002657,0.093776,0.001969,2036.564439,347653.1421,2244.002816,...,0.08619,0.0785,0.083523,0.084613,0.073009,0.074552,0.073247,0.087003,0.079202,0.072906


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw feature table.
data.describe()

In [3]:

# DATA STANDARDISATION:
# Rescale every feature column to zero mean and unit variance so that no single
# feature dominates the distance computations used later on.
stdScaler = StandardScaler()
feature_columns = data.loc[:, 'tempo':].columns
std_data = pd.DataFrame(stdScaler.fit_transform(x), columns=feature_columns)
std_data.head()


Unnamed: 0,tempo,duration,rms_mean,rms_var,zcr_mean,zcr_var,cent_mean,cent_var,spec_bw_mean,spec_bw_var,...,perc_var03,perc_var04,perc_var05,perc_var06,perc_var07,perc_var08,perc_var09,perc_var10,perc_var11,perc_var12
0,0.573494,-0.014596,-0.92903,-0.508462,0.049387,-0.453817,0.252767,-0.597655,0.490839,-0.428628,...,-0.239131,0.533731,0.271378,1.655357,0.179744,-0.969579,0.660102,1.113903,1.389488,0.016132
1,-0.71411,0.299356,-1.048059,-0.235975,-1.235431,-0.552923,-0.859988,0.627249,-0.227072,1.715064,...,-0.770911,-0.712681,0.399433,-0.364745,0.755549,-0.09756,-1.777902,-0.240154,-1.113272,-0.135327
2,-0.71411,-0.275984,0.446608,0.047001,-0.898285,0.217267,-0.565409,0.63506,0.161201,0.906321,...,1.600225,0.478513,0.001791,1.914052,0.222917,1.545789,0.51169,1.194278,1.488802,-0.771762
3,0.573494,0.022677,-0.192104,-0.041367,1.80094,-0.289165,0.607759,-0.855614,-0.252974,-0.625462,...,0.840396,2.016294,1.307182,0.663541,-0.024746,0.070716,-0.989216,0.849939,0.996935,1.283986
4,-0.71411,0.933951,-0.907346,-0.66006,0.073899,-0.558713,0.074718,-0.751923,0.261137,-0.537538,...,0.241405,-0.192016,0.154808,0.253037,-0.583582,-0.619667,-0.599986,0.210418,-0.126923,-0.64961


In [None]:
# Sanity check on the scaled features: each column's mean should be close to 0
# and its std close to 1.
std_data.describe()

In [4]:

# Project the standardised features onto their first Nstd_PC principal components.
Nstd_PC = 6
std_pca = PCA(n_components=Nstd_PC)

# Fit the PCA and wrap the resulting scores in a DataFrame in a single step
# (columns are the default integer labels 0..Nstd_PC-1).
std_PCs = pd.DataFrame(std_pca.fit_transform(std_data.to_numpy()))
std_PCs.head()


Unnamed: 0,0,1,2,3,4,5
0,-5.0483,5.986089,-0.325991,1.107762,0.440155,6.929225
1,-8.138771,-3.889754,3.488342,2.878647,0.534389,-3.191782
2,-1.849023,1.977019,-3.571198,-0.257715,-4.431498,0.662009
3,5.167924,8.968567,3.021486,-5.06204,4.292247,5.851826
4,-5.629903,0.830023,-1.753262,-0.348749,1.276269,-0.0766


In [None]:

# Elbow method: fit K-Means on the PCA scores for k = 1..19 and record the
# within-cluster sum of squares (the model's inertia) for each k.
k_values = range(1, 20)
wcss = []
for i in k_values:
    # fit() returns the estimator itself, so construction and fitting can be chained.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(std_PCs)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" of the curve suggests a reasonable cluster count.
fig, ax = plt.subplots()
ax.plot(k_values, wcss)
ax.set(title='Elbow Method', xlabel='Number of clusters', ylabel='WCSS')
plt.show()


In [5]:

# Final clustering model, fitted on the PCA scores.
N_CLUSTERS = 20
kmeans = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=300, n_init=10, random_state=0)
km = kmeans.fit(std_PCs)  # fit() returns the estimator, so km and kmeans are the same object

# One row per song: its file name and the cluster label assigned to it.
cluster_map = pd.DataFrame({
    'song_name': data['song_name'],
    'cluster': km.labels_,
})
cluster_map.head()


Unnamed: 0,song_name,cluster
0,'Cos I'm your lady.mp3,7
1,-Songs-of-War-The-North-Remembers-Game-of-Thro...,13
2,004 Enna Sona.mp3,19
3,005 SWIKAR KIYA MAINE = CHAND KE PAAS JO.mp3,9
4,007. Kabhi To Nazar Milao = Kabhi To Nazar Mil...,19


In [8]:

import json

# Group song names by their K-Means label: clusters[k] is the list of songs
# assigned to cluster k, in the order they appear in `data`.
# The number of buckets is read from the fitted model rather than hard-coded,
# so this cell stays correct if the cluster count in the fitting cell changes
# (a fixed count would raise IndexError for more clusters, or silently write
# empty trailing buckets for fewer).
clusters = [[] for _ in range(km.n_clusters)]
for label, song in zip(km.labels_, data['song_name']):
    clusters[label].append(song)

# Persist the grouping as a JSON array of arrays of song names.
with open('clusters.json', 'w') as outfile:
    json.dump(clusters, outfile)


In [7]:
# Order the assignments by cluster label, then alphabetically by song name.
sorted_cluster_map = cluster_map.sort_values(by=['cluster', 'song_name'])

# Temporarily lift pandas' row/column display limits so the whole listing is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(sorted_cluster_map)

                                              song_name  cluster
6                01 - Besabriyaan - DownloadMing.SE.mp3        0
29                03 Ni Sultana Re - Pyar Ka Mausam.mp3        0
31                  033 AJNABEE = HUM DONO DO PREMI.mp3        0
37         041 THODI SI BEWAFAI = ANKHON MAIN HAMNE.mp3        0
46                     06 Rimjhim Ke Geet - Anjaana.mp3        0
51                          0DilHoomKare(L)Rudaali).mp3        0
54     0Too Is Tarah Se (Aap To Aise Na The) - Copy.mp3        0
63                  26 Tumhin Mere Mandir - Khandan.mp3        0
65            28 - Troye Sivan - Blue Moon w-Lyrics.mp3        0
67                   34 Main To Tum Sang - Manmauji.mp3        0
73    500_Miles_-_Inside_Llewyn_Davis-[AudioTrimmer....        0
76    55 Chale The Saath Milke - Haseena Man Jayegi.mp3        0
172                             AUD-20170619-WA0003.mp3        0
84    Aaj Bichhde Hain  Shaam-E-Ghazal  Thodi Si Bew...        0
85    Aaj Jaane Ki Zid Na