In [83]:
import pandas as pd
import math
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import numpy as np

In [84]:
dataset = pd.read_csv('Frogs_MFCCs.csv')
dataset=dataset.drop(dataset.columns[[0, 22, 23, 24, 25]], axis=1)

In [85]:
#method to clean the dataset
#based on the datatype of the attribute, the NaN values are replaced with standard values
#if datatype is object, replaced with empty string
#if datatype is int64, replaced with rounded off mean value
#if datatype is float64, replaced with meanvalue
def cleanDataFrame(data):
    for i in range(0, data.shape[1]):
        colValues = data.iloc[:,i:i+1].iloc[:,0]
        colType = data.iloc[:,i:i+1].iloc[:,0].dtype
        if colType == 'int64':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna(round(colValues.mean()))
        elif colType == 'float64':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna(colValues.mean())        
        elif colType == 'object':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna('')

In [86]:
cleanDataFrame(dataset)

In [87]:
#changing object attributes to categorical attributes 
for header in list(dataset):
    if(dataset[header].dtypes=='O'):
        dataset[header] = dataset[header].astype('category').cat.codes       

In [88]:
#centroid of the dataset
centroid = dataset.mean().to_frame().T

In [89]:
#Methood to find the total sum of squares for the dataset D and centroid C
def findTSSforD(D, C):
    mTSS = 0
    for index, row in D.iterrows():
        rowsum = 0
        for header in list(D):            
            rowsum += math.pow((row[header]-C[header]), 2)                 
        mTSS += rowsum
    return mTSS    

In [90]:
#total sum of squares of the data set
TSS = findTSSforD(dataset, centroid)
print(TSS)

3693.2580926313963


In [91]:
print('K-Means Clustering')
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(dataset)
    kTSS = [None]*k
    dataset['cluster'] = kmeans.labels_
    for clustNum in range(0, k):
        centroidForClust = pd.DataFrame(kmeans.cluster_centers_[clustNum]).T.iloc[:,0:21]             
        centroidForClust.columns = list(dataset.iloc[:,0:21])        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:21], centroidForClust)
    TWSS = sum(kTSS) 
    print('For k=',k, 'total within sum of squares/total sum of squares=', TWSS/TSS)     

K-Means Clustering
For k= 1 total within sum of squares/total sum of squares= 1.0000000000000002
For k= 2 total within sum of squares/total sum of squares= 0.6584850467058518
For k= 3 total within sum of squares/total sum of squares= 0.5303531912183984
For k= 4 total within sum of squares/total sum of squares= 0.46660132577458086
For k= 5 total within sum of squares/total sum of squares= 0.42089511187065076
For k= 6 total within sum of squares/total sum of squares= 0.388914282168457
For k= 7 total within sum of squares/total sum of squares= 0.35789363101678234
For k= 8 total within sum of squares/total sum of squares= 0.33060144345315245
For k= 9 total within sum of squares/total sum of squares= 0.3094970593322418
For k= 10 total within sum of squares/total sum of squares= 0.28956087743177633


In [92]:
print('H-Clustering')
for k in range(1, 11):
    hcluster = AgglomerativeClustering(n_clusters=k)
    hcluster.fit(dataset)
    kTSS = [None]*k
    dataset['cluster'] = hcluster.labels_
    for clustNum in range(0, k):        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:21], dataset[dataset['cluster']==clustNum].iloc[:,0:21].mean().to_frame().T)
    TWSS = sum(kTSS) 
    print('For k=', k, ' total within sum of squares/total sum of squares=', TWSS/TSS)

H-Clustering
For k= 1  total within sum of squares/total sum of squares= 1.0
For k= 2  total within sum of squares/total sum of squares= 0.6627921553377006
For k= 3  total within sum of squares/total sum of squares= 0.5549805607077856
For k= 4  total within sum of squares/total sum of squares= 0.48130348442994947
For k= 5  total within sum of squares/total sum of squares= 0.4267721055557141
For k= 6  total within sum of squares/total sum of squares= 0.38856566261423087
For k= 7  total within sum of squares/total sum of squares= 0.3622004054856775
For k= 8  total within sum of squares/total sum of squares= 0.3389011165136607
For k= 9  total within sum of squares/total sum of squares= 0.31641535950554867
For k= 10  total within sum of squares/total sum of squares= 0.30056773025965355


In [93]:
print('Gaussian Mixture Models')
for k in range(1, 11):
    gmm = GaussianMixture(n_components=k)
    gmm.fit(dataset)
    kTSS = [None]*k
    dataset['cluster'] = gmm.predict(dataset)
    for clustNum in range(0, k):        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:21], dataset[dataset['cluster']==clustNum].iloc[:,0:21].mean().to_frame().T)
    TWSS = sum(kTSS) 
    print('For k=', k, ' total within sum of squares/total sum of squares=', TWSS/TSS)

Gaussian Mixture Models
For k= 1  total within sum of squares/total sum of squares= 1.0
For k= 2  total within sum of squares/total sum of squares= 0.6972745750248325
For k= 3  total within sum of squares/total sum of squares= 0.6134034859977475
For k= 4  total within sum of squares/total sum of squares= 0.5632981136742243
For k= 5  total within sum of squares/total sum of squares= 0.6598766270051774
For k= 6  total within sum of squares/total sum of squares= 0.4660743632868852
For k= 7  total within sum of squares/total sum of squares= 0.41203341053823545
For k= 8  total within sum of squares/total sum of squares= 0.3967960618127207
For k= 9  total within sum of squares/total sum of squares= 0.3899079998749755
For k= 10  total within sum of squares/total sum of squares= 0.3759276633997933
