In [157]:
import pandas as pd
import math
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import numpy as np

In [158]:
dataset = pd.read_csv('dataset_Facebook.csv',sep=';')

In [159]:
#method to clean the dataset
#based on the datatype of the attribute, the NaN values are replaced with standard values
#if datatype is object, replaced with empty string
#if datatype is int64, replaced with rounded off mean value
#if datatype is float64, replaced with meanvalue
def cleanDataFrame(data):
    for i in range(0, data.shape[1]):
        colValues = data.iloc[:,i:i+1].iloc[:,0]
        colType = data.iloc[:,i:i+1].iloc[:,0].dtype
        if colType == 'int64':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna(round(colValues.mean()))
        elif colType == 'float64':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna(colValues.mean())        
        elif colType == 'object':
            data.iloc[:,i:i+1] = data.iloc[:,i:i+1].fillna('')        

In [160]:
#cleaning both test and train data by calling the defined function
cleanDataFrame(dataset)

In [161]:
for header in list(dataset):
    if(dataset[header].dtypes=='O'):
        dataset[header] = dataset[header].astype('category').cat.codes

In [162]:
centroid = dataset.mean().to_frame().T
centroid

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,123194.176,1.074,1.88,7.038,4.15,7.84,0.278557,13903.36,29585.948,920.344,798.772,1415.13,16766.376,6585.488,609.986,7.482,177.945892,27.266129,212.12


In [163]:
#Methood to find the total sum of squares for the dataset D and centroid C
def findTSSforD(D, C):
    mTSS = 0
    for index, row in D.iterrows():
        rowsum = 0
        for header in list(D):   
            rowsum += math.pow((row[header]-C[header]), 2)                 
        mTSS += rowsum
    return mTSS    

In [164]:
TSS = findTSSforD(dataset, centroid)
print(TSS)

5150201201513.516


In [165]:
print('K-Means Clustering')
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(dataset)
    kTSS = [None]*k
    dataset['cluster'] = kmeans.labels_
    for clustNum in range(0, k):
        centroidForClust = pd.DataFrame(kmeans.cluster_centers_[clustNum]).T.iloc[:,0:19]             
        centroidForClust.columns = list(dataset.iloc[:,0:19])        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:19], centroidForClust)
    TWSS = sum(kTSS) 
    print('For k=', k, ' total within sum of squares/total sum of squares=', TWSS/TSS)     

K-Means Clustering
For k= 1  total within sum of squares/total sum of squares= 1.0000000000000002
For k= 2  total within sum of squares/total sum of squares= 0.4218295697372187
For k= 3  total within sum of squares/total sum of squares= 0.20559159729380097
For k= 4  total within sum of squares/total sum of squares= 0.13893094177631723
For k= 5  total within sum of squares/total sum of squares= 0.09925650124223634
For k= 6  total within sum of squares/total sum of squares= 0.07205460575989185
For k= 7  total within sum of squares/total sum of squares= 0.054014971304570954
For k= 8  total within sum of squares/total sum of squares= 0.040997314987684656
For k= 9  total within sum of squares/total sum of squares= 0.03427696459958252
For k= 10  total within sum of squares/total sum of squares= 0.028749153052166847


In [166]:
print('H-Clustering')
for k in range(1, 11):
    hcluster = AgglomerativeClustering(n_clusters=k)
    hcluster.fit(dataset)
    kTSS = [None]*k
    dataset['cluster'] = hcluster.labels_
    for clustNum in range(0, k):        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:19], dataset[dataset['cluster']==clustNum].iloc[:,0:19].mean().to_frame().T)
    TWSS = sum(kTSS)
    print('For k=', k, ' total within sum of squares/total sum of squares=', TWSS/TSS)

H-Clustering
For k= 1  total within sum of squares/total sum of squares= 1.0
For k= 2  total within sum of squares/total sum of squares= 0.4218295697372187
For k= 3  total within sum of squares/total sum of squares= 0.20988978791170604
For k= 4  total within sum of squares/total sum of squares= 0.1406854628175006
For k= 5  total within sum of squares/total sum of squares= 0.10101102228341968
For k= 6  total within sum of squares/total sum of squares= 0.07297862905979668
For k= 7  total within sum of squares/total sum of squares= 0.054952118013899674
For k= 8  total within sum of squares/total sum of squares= 0.045891171448596155
For k= 9  total within sum of squares/total sum of squares= 0.03697644221153005
For k= 10  total within sum of squares/total sum of squares= 0.030236784707162036


In [167]:
print('Gaussian Mixture Models')
for k in range(1, 11):
    gmm = GaussianMixture(n_components=k, reg_covar=1e-4)
    gmm.fit(dataset)
    #print(gmm.means_)
    kTSS = [None]*k
    dataset['cluster'] = gmm.predict(dataset)
    for clustNum in range(0, k):        
        kTSS[clustNum] = findTSSforD(dataset[dataset['cluster']==clustNum].iloc[:,0:19], dataset[dataset['cluster']==clustNum].iloc[:,0:19].mean().to_frame().T)
    TWSS = sum(kTSS) 
    print('For k=', k, ' total within sum of squares/total sum of squares=', TWSS/TSS)

Gaussian Mixture Models
For k= 1  total within sum of squares/total sum of squares= 1.0
For k= 2  total within sum of squares/total sum of squares= 0.4218295697372187
For k= 3  total within sum of squares/total sum of squares= 0.21479997257237857
For k= 4  total within sum of squares/total sum of squares= 0.21276438608632445
For k= 5  total within sum of squares/total sum of squares= 0.12724563299645558
For k= 6  total within sum of squares/total sum of squares= 0.11825720086491838
For k= 7  total within sum of squares/total sum of squares= 0.07395488845928191
For k= 8  total within sum of squares/total sum of squares= 0.06806746016547731
For k= 9  total within sum of squares/total sum of squares= 0.05144811636386741
For k= 10  total within sum of squares/total sum of squares= 0.04638378817334704
