In [2]:
# Import the libraries used throughout the notebook
import os
import numpy as np
import pandas as pd
import IPython.display as ipd
from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 
import librosa
from sklearn import mixture
from numpy import unique
from numpy import where
from matplotlib import pyplot as plt
import librosa.display
from sklearn import metrics
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder

# CLUSTERING MODELS

In [3]:
def kmeans_model(train_data,n_clusters):
    """Cluster ``train_data`` with K-Means and report the assignments.

    Parameters
    ----------
    train_data : array-like or DataFrame of feature rows.
    n_clusters : number of clusters requested from K-Means.

    Returns
    -------
    (yhat, clusters) : the cluster id predicted for every row of
        ``train_data`` and the distinct cluster ids found in ``yhat``.
    """
    # k-means++ seeding, 200 restarts and a very high iteration cap.
    estimator = sklearn.cluster.KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=200,
        max_iter=1000000,
    )
    estimator.fit(train_data)
    # Predict on the same rows the model was fitted on.
    assignments = estimator.predict(train_data)
    return assignments, unique(assignments)

In [4]:
def agglomerative_model(train_data,n_clusters):
    """Cluster ``train_data`` with agglomerative clustering.

    Parameters
    ----------
    train_data : array-like or DataFrame of feature rows.
    n_clusters : number of clusters to produce.

    Returns
    -------
    (yhat, clusters) : the cluster id assigned to every row and the
        distinct cluster ids present in that assignment.
    """
    clusterer = sklearn.cluster.AgglomerativeClustering(n_clusters=n_clusters)
    # Fit and label the rows in a single step.
    assignments = clusterer.fit_predict(train_data)
    distinct_ids = unique(assignments)
    return assignments, distinct_ids

In [5]:
def birch_model(train_data,n_clusters):
    """Cluster ``train_data`` with BIRCH (threshold fixed at 0.01).

    Parameters
    ----------
    train_data : array-like or DataFrame of feature rows.
    n_clusters : number of final clusters requested from BIRCH.

    Returns
    -------
    (yhat, clusters) : the cluster id predicted for every row of
        ``train_data`` and the distinct cluster ids found in ``yhat``.
    """
    estimator = sklearn.cluster.Birch(n_clusters=n_clusters, threshold=0.01)
    estimator.fit(train_data)
    # Predict on the same rows the model was fitted on.
    assignments = estimator.predict(train_data)
    return assignments, unique(assignments)

In [6]:
def spectral_model(train_data,n_clusters):
    """Cluster ``train_data`` with spectral clustering.

    Parameters
    ----------
    train_data : array-like or DataFrame of feature rows.
    n_clusters : number of clusters to produce.

    Returns
    -------
    (yhat, clusters) : the cluster id assigned to every row and the
        distinct cluster ids present in that assignment.
    """
    clusterer = sklearn.cluster.SpectralClustering(n_clusters=n_clusters)
    # Fit and label the rows in a single step.
    assignments = clusterer.fit_predict(train_data)
    distinct_ids = unique(assignments)
    return assignments, distinct_ids

In [7]:
def dbscan_model(train_data):
    """Cluster ``train_data`` with DBSCAN (eps=0.5, min_samples=20).

    The number of clusters is decided by the algorithm, so none is passed in.

    Returns
    -------
    (yhat, clusters) : the label assigned to every row and the distinct
        labels found. scikit-learn's DBSCAN labels noise samples ``-1``, so
        ``clusters`` may contain ``-1`` alongside real cluster ids.
    """
    clusterer = sklearn.cluster.DBSCAN(eps=0.5, min_samples=20)
    assignments = clusterer.fit_predict(train_data)
    return assignments, unique(assignments)
    

In [8]:
def mean_shift_model(train_data):
    """Cluster ``train_data`` with Mean-Shift using an estimated bandwidth.

    The bandwidth is estimated from ``train_data`` itself with
    ``quantile=0.2`` and ``n_samples=500``; the number of clusters is
    decided by the algorithm.

    Returns
    -------
    (yhat, clusters) : the cluster id assigned to every row and the
        distinct cluster ids present in that assignment.
    """
    estimated_bandwidth = sklearn.cluster.estimate_bandwidth(
        train_data, quantile=0.2, n_samples=500
    )
    clusterer = sklearn.cluster.MeanShift(bandwidth=estimated_bandwidth)
    assignments = clusterer.fit_predict(train_data)
    return assignments, unique(assignments)
    

In [9]:
def optics_model(train_data):
    """Cluster ``train_data`` with OPTICS (eps=0.5, min_samples=20).

    The number of clusters is decided by the algorithm, so none is passed in.

    Returns
    -------
    (yhat, clusters) : the label assigned to every row and the distinct
        labels found. scikit-learn's OPTICS labels noisy samples ``-1``, so
        ``clusters`` may contain ``-1`` alongside real cluster ids.
    """
    # NOTE(review): scikit-learn documents `eps` as being used only when
    # cluster_method='dbscan'; cluster_method is left at its default here, so
    # eps=0.5 may have no effect -- confirm the intended configuration.
    clusterer = sklearn.cluster.OPTICS(eps=0.5, min_samples=20)
    assignments = clusterer.fit_predict(train_data)
    return assignments, unique(assignments)

In [10]:
def gaussian_model(train_data,n_clusters):
    """Cluster ``train_data`` with a Gaussian mixture model.

    Parameters
    ----------
    train_data : array-like or DataFrame of feature rows.
    n_clusters : number of mixture components to fit.

    Returns
    -------
    (yhat, clusters) : the component index predicted for every row of
        ``train_data`` and the distinct component indices found in ``yhat``.
    """
    mixture_estimator = sklearn.mixture.GaussianMixture(n_components=n_clusters)
    mixture_estimator.fit(train_data)
    # Each row is labelled with the component predicted for it.
    assignments = mixture_estimator.predict(train_data)
    return assignments, unique(assignments)

In [11]:
def pred_cluster_label(yhat,clusters,cluster_df,output_df,model):
    """Record one model's cluster assignments and score them.

    Parameters
    ----------
    yhat : per-row cluster ids produced by a clustering model, in the same
        row order as ``cluster_df``.
    clusters : distinct cluster ids found in ``yhat``; only its length is
        used, to fill the ``n_clusters`` column.
    cluster_df : DataFrame holding an ``actual_label_encode`` column. It is
        modified in place: its ``pred_label`` column is overwritten.
    output_df : results table indexed by model name; updated in place.
    model : row label of ``output_df`` to fill in.

    Returns
    -------
    The updated ``output_df``.

    Raises
    ------
    ValueError : if ``yhat`` does not have one entry per row of ``cluster_df``.
    """
    # Assign by position. The previous implementation matched the positional
    # row numbers returned by where() against index *labels*, which is only
    # correct when cluster_df carries a default 0..n-1 index, and it scanned
    # the whole frame once per cluster. list() prevents pandas from aligning
    # on an index if yhat happens to be a Series.
    cluster_df['pred_label'] = list(yhat)

    output_df.loc[model,'n_clusters'] = len(clusters)
    output_df = evaluation_Score(
        cluster_df['actual_label_encode'],
        cluster_df['pred_label'],
        output_df,
        model,
    )

    return output_df

In [12]:
def evaluation_Score(y_true,y_pred,output_df,model):
    """Fill the score columns of ``output_df`` for one clustering model.

    Parameters
    ----------
    y_true : encoded ground-truth labels.
    y_pred : cluster ids predicted by the model, in the same row order.
    output_df : results table indexed by model name; updated in place.
    model : row label of ``output_df`` to fill in.

    Columns written: ``ARI`` (adjusted Rand index), ``MI`` (adjusted mutual
    information), ``H`` (homogeneity), ``C`` (completeness), ``V``
    (V-measure), ``FM`` (Fowlkes-Mallows), ``A`` (accuracy), ``R`` (recall)
    and ``P`` (precision).

    Returns
    -------
    The updated ``output_df``.

    Raises
    ------
    ValueError : propagated from scikit-learn if any of the metrics other
        than the default recall/precision call rejects the inputs.
    """
    # NOTE(review): accuracy, recall and precision compare cluster ids with
    # the encoded class ids value-for-value. Cluster ids come straight from
    # the clustering models, so these three columns presumably depend on how
    # a model happens to number its clusters -- confirm whether the ids
    # should be aligned with the classes before scoring.
    scorers = (
        ('ARI', metrics.adjusted_rand_score),
        ('MI', metrics.adjusted_mutual_info_score),
        ('H', metrics.homogeneity_score),
        ('C', metrics.completeness_score),
        ('V', metrics.v_measure_score),
        ('FM', metrics.fowlkes_mallows_score),
        ('A', metrics.accuracy_score),
    )
    # These run outside the try block so that a failure in any of them is
    # reported instead of being routed into the recall/precision fallback.
    for column, scorer in scorers:
        output_df.loc[model, column] = scorer(y_true, y_pred)

    # The default recall/precision call raises ValueError when the labels
    # are not a plain binary problem; fall back to micro-averaging then.
    try:
        recall = metrics.recall_score(y_true, y_pred)
        precision = metrics.precision_score(y_true, y_pred)
    except ValueError:
        recall = metrics.recall_score(y_true, y_pred, average='micro')
        precision = metrics.precision_score(y_true, y_pred, average='micro')

    output_df.loc[model,'R'] = recall
    output_df.loc[model,'P'] = precision

    return output_df
    
    
    
    
   
    

In [13]:
def get_muslim_labels(filename, annotation_path="D://Himani-work/gsoc2020/dataset/spreadsheet_data/muslim_concordance_250_annotated.xls"):
    """Build encoded annotation labels for the clips listed in a feature CSV.

    Parameters
    ----------
    filename : path to a CSV with a ``file_name`` column.
    annotation_path : path to the annotation spreadsheet. It must contain a
        ``File Name`` column and the flag columns ``u``, ``a``,
        ``misaligned/error/etc.``, ``some other problem`` and
        ``can't decide``. Defaults to the location previously hard-coded in
        this function.

    Returns
    -------
    Integer-encoded labels (via ``LabelEncoder``), one per matching
    annotation row, ordered by the files in ``filename``. Files without a
    matching annotation row, or whose row has no flag equal to 1, contribute
    no label, so the result can be shorter than the feature CSV.
    """
    feature_df = pd.read_csv(filename)
    final_files = list(feature_df['file_name'])

    # Rows with missing values are discarded. The index is rebuilt because
    # dropna() leaves gaps; the previous implementation looked rows up with
    # df.loc[i] for i in range(len(df)), which raises KeyError (or skips
    # trailing rows) once any row has been dropped.
    annotations = pd.read_excel(annotation_path).dropna().reset_index(drop=True)

    # Flag column -> short label, in priority order (first flag set wins).
    flag_labels = (
        ('u', 'u'),
        ('a', 'a'),
        ('misaligned/error/etc.', 'misaligned'),
        ('some other problem', 'other'),
        ("can't decide", 'cantdecide'),
    )

    # Index the annotations by file name once instead of rescanning the
    # whole sheet for every file. A list is kept per file so that repeated
    # annotation rows still yield one label each, in sheet order.
    labels_by_file = {}
    for _, row in annotations.iterrows():
        for column, label in flag_labels:
            if row[column] == 1:
                labels_by_file.setdefault(row['File Name'], []).append(label)
                break

    final_updated_files = []
    for file in final_files:
        final_updated_files.extend(labels_by_file.get(file, []))

    le = LabelEncoder()
    le.fit(np.array(final_updated_files))

    encoded_labels = le.transform(np.array(final_updated_files))
    print("classes",le.classes_)
    return encoded_labels

In [14]:
def load_dataset(filename,normalization):
    """Read a feature CSV and split it into features and clip labels.

    Parameters
    ----------
    filename : path to a CSV whose last column is dropped from the features
        and which contains a ``file_name`` column.
    normalization : ``'min-max'`` to rescale with ``MinMaxScaler``,
        ``'standard'`` to rescale with ``StandardScaler``; any other value
        leaves the features unscaled.

    Returns
    -------
    (train_data, actual_labels) : the feature DataFrame (re-wrapped with
        default integer column names when scaled) and a Series with the
        label token parsed out of each ``file_name``.
    """
    raw_df = pd.read_csv(filename)
    # Every column except the last one is treated as a feature.
    train_data = raw_df.iloc[:, :-1]

    scaler = None
    if normalization == 'min-max':
        scaler = MinMaxScaler()
    elif normalization == 'standard':
        scaler = StandardScaler()

    if scaler is not None:
        train_data = pd.DataFrame(scaler.fit_transform(train_data))

    # Ideology data: the label is the second "_"-separated token of whatever
    # follows "clip_" in the file name.
    after_clip = raw_df['file_name'].str.split("clip_", expand=True)[1]
    actual_labels = after_clip.str.split("_", expand=True)[1]

    # Muslim data: use this instead of the file-name parsing above.
    # actual_labels=get_muslim_labels(filename)

    return train_data, actual_labels
    
    

In [15]:
def run_model(train_data,actual_labels,output_df):
    """Run every enabled clustering model and collect its scores.

    Parameters
    ----------
    train_data : feature rows handed unchanged to each clustering model.
    actual_labels : ground-truth label per row; ``'ee'`` is encoded as 0 and
        every other value (including ``'ai'``) as 1.
    output_df : results table indexed by model name; one row is filled per
        model.

    Returns
    -------
    The updated ``output_df``.
    """
    cluster_df = pd.DataFrame()

    # Ideology data: binary encoding of the labels, defaulting to 1.
    cluster_df['actual_label'] = list(actual_labels)
    cluster_df['actual_label_encode'] = 1
    is_ee = cluster_df['actual_label'] == 'ee'
    cluster_df.loc[is_ee, 'actual_label_encode'] = 0

    # Muslim dataset: use this instead of the encoding above.
    # cluster_df['actual_label_encode']=actual_labels

    # (progress message, row label in output_df, callable running the model)
    model_runs = [
        ("K-means results", 'K-Means', lambda data: kmeans_model(data, 2)),
        ("Agglomerative clustering results", 'Agglomerative clustering',
         lambda data: agglomerative_model(data, 2)),
        ("Birch", 'Birch', lambda data: birch_model(data, 2)),
        ("DBSCAN", 'DBSCAN', lambda data: dbscan_model(data)),
        ("Mean shift", 'Mean-shift', lambda data: mean_shift_model(data)),
        ("Optics", 'Optics', lambda data: optics_model(data)),
        ("Gaussian Mixture", 'Gaussian-mixture', lambda data: gaussian_model(data, 2)),
        # Disabled:
        # ("Spectral", 'Spectral-clustering', lambda data: spectral_model(data, 2)),
    ]

    for message, row_label, fit_model in model_runs:
        print(message)
        yhat, clusters = fit_model(train_data)
        output_df = pred_cluster_label(yhat, clusters, cluster_df, output_df, row_label)

    return output_df




In [16]:
def pca_transform(train_data):
    """Project ``train_data`` onto its first two principal components.

    Returns
    -------
    DataFrame with columns ``P1`` and ``P2``, one row per input row.
    """
    reducer = PCA(n_components=2)
    projected = pd.DataFrame(reducer.fit_transform(train_data))
    projected.columns = ['P1', 'P2']
    return projected
    

In [17]:
def tsne_transform(train_data):
    """Embed ``train_data`` in two dimensions with t-SNE.

    Returns
    -------
    DataFrame with columns ``P1`` and ``P2``, one row per input row.
    """
    reducer = TSNE(n_components=2)
    embedded = pd.DataFrame(reducer.fit_transform(train_data))
    embedded.columns = ['P1', 'P2']
    return embedded

In [18]:
def runClustering(filename,dimensionality=None,normalization='standard'):
    """Load a feature CSV, optionally reduce it to 2-D, and score all models.

    Parameters
    ----------
    filename : path to the feature CSV, passed to ``load_dataset``.
    dimensionality : ``None`` to cluster the features as loaded, ``'pca'``
        to cluster their 2-component PCA projection, or ``'tsne'`` to cluster
        their 2-component t-SNE embedding.
    normalization : normalization mode passed to ``load_dataset``; defaults
        to ``'standard'``, the value previously hard-coded here.

    Returns
    -------
    DataFrame with one row per clustering model and the columns
    ``n_clusters, ARI, MI, H, C, V, FM, A, R, P``.

    Raises
    ------
    ValueError : if ``dimensionality`` is not one of the supported values.
        Previously an unsupported value silently returned a table containing
        only NaN.
    """
    supported = (None, 'pca', 'tsne')
    # Validate before doing any I/O so a typo fails fast.
    if dimensionality not in supported:
        raise ValueError(
            "Unsupported dimensionality %r; expected one of %r"
            % (dimensionality, supported)
        )

    train_data, actual_labels = load_dataset(filename, normalization)

    output_df = pd.DataFrame(
        index=['K-Means','Agglomerative clustering','Birch','DBSCAN','Mean-shift','Optics','Gaussian-mixture'],
        columns=['n_clusters','ARI','MI','H','C','V','FM','A','R','P'],
    )

    if dimensionality == 'pca':
        train_data = pca_transform(train_data)
    elif dimensionality == 'tsne':
        train_data = tsne_transform(train_data)

    return run_model(train_data, actual_labels, output_df)
        
    

In [27]:
# Run every clustering model on 'mfcc0_ideology_five_features_word.csv'
# (default dimensionality=None) and display the resulting score table.
output_df_ideology_word=runClustering('mfcc0_ideology_five_features_word.csv')
output_df_ideology_word

K-means results
Agglomerative clustering results
Birch
DBSCAN
Mean shift
Optics
Gaussian Mixture


Unnamed: 0,n_clusters,ARI,MI,H,C,V,FM,A,R,P
K-Means,2,-0.000691441,0.000812674,0.00177892,0.00085096,0.00115122,0.638404,0.501559,0.509266,0.887021
Agglomerative clustering,2,-0.0259513,0.00283869,0.00430432,0.00258601,0.00323091,0.711874,0.307239,0.247876,0.925072
Birch,2,0.00496477,0.00332494,0.00565209,0.00270963,0.00366314,0.641575,0.456529,0.458301,0.87731
DBSCAN,1,0.0,1.60559e-15,2.51292e-16,1.0,5.02584e-16,0.902969,0.0,0.0,0.0
Mean-shift,4,0.00870728,-0.00128863,0.000785446,0.00372687,0.00129745,0.893653,0.102529,0.102529,0.102529
Optics,3,-0.0324598,0.00897184,0.00723311,0.0200218,0.0106271,0.879482,0.00865951,0.00865951,0.00865951
Gaussian-mixture,2,-0.00510855,0.00194543,0.0035068,0.00169579,0.00228609,0.641449,0.464496,0.445174,0.913629


In [32]:
# Display the previously computed score table again under a printed caption;
# nothing is recomputed in this cell.
print("results of ideology word ")
output_df_ideology_word

results of ideology word 


Unnamed: 0,n_clusters,ARI,MI,H,C,V,FM,A,R,P
K-Means,2,-0.000691441,0.000812674,0.00177892,0.00085096,0.00115122,0.638404,0.501559,0.509266,0.887021
Agglomerative clustering,2,-0.0259513,0.00283869,0.00430432,0.00258601,0.00323091,0.711874,0.307239,0.247876,0.925072
Birch,2,0.00496477,0.00332494,0.00565209,0.00270963,0.00366314,0.641575,0.456529,0.458301,0.87731
DBSCAN,1,0.0,1.60559e-15,2.51292e-16,1.0,5.02584e-16,0.902969,0.0,0.0,0.0
Mean-shift,4,0.00870728,-0.00128863,0.000785446,0.00372687,0.00129745,0.893653,0.102529,0.102529,0.102529
Optics,3,-0.0324598,0.00897184,0.00723311,0.0200218,0.0106271,0.879482,0.00865951,0.00865951,0.00865951
Gaussian-mixture,2,-0.00510855,0.00194543,0.0035068,0.00169579,0.00228609,0.641449,0.464496,0.445174,0.913629


In [29]:
# Run every clustering model on 'mfcc0_ideology_five_features_vowel.csv'
# (default dimensionality=None) and display the resulting score table.
output_df_ideology_vowel=runClustering('mfcc0_ideology_five_features_vowel.csv')
output_df_ideology_vowel

K-means results
Agglomerative clustering results
Birch
DBSCAN
Mean shift
Optics
Gaussian Mixture


Unnamed: 0,n_clusters,ARI,MI,H,C,V,FM,A,R,P
K-Means,2,0.0463458,0.0686315,0.106183,0.0510476,0.0689481,0.658866,0.613093,0.587645,0.96881
Agglomerative clustering,2,0.413724,0.213704,0.222335,0.206456,0.214102,0.886181,0.102182,0.0637066,0.496988
Birch,2,0.426168,0.223113,0.213596,0.234457,0.223541,0.90014,0.0907516,0.0432432,0.432432
DBSCAN,1,0.0,1.60559e-15,2.51292e-16,1.0,5.02584e-16,0.902969,0.0,0.0,0.0
Mean-shift,2,0.0555602,0.025089,0.0147516,0.113515,0.0261102,0.902427,0.101836,0.0034749,0.428571
Optics,3,-0.0341463,0.00953937,0.0076981,0.0203393,0.0111689,0.87801,0.0148944,0.0148944,0.0148944
Gaussian-mixture,2,0.320127,0.18661,0.238548,0.153695,0.186944,0.823397,0.840665,0.851351,0.967105


In [33]:
# Display the previously computed score table again under a printed caption;
# nothing is recomputed in this cell.
print("results of ideology vowel ")
output_df_ideology_vowel

results of ideology vowel 


Unnamed: 0,n_clusters,ARI,MI,H,C,V,FM,A,R,P
K-Means,2,0.0463458,0.0686315,0.106183,0.0510476,0.0689481,0.658866,0.613093,0.587645,0.96881
Agglomerative clustering,2,0.413724,0.213704,0.222335,0.206456,0.214102,0.886181,0.102182,0.0637066,0.496988
Birch,2,0.426168,0.223113,0.213596,0.234457,0.223541,0.90014,0.0907516,0.0432432,0.432432
DBSCAN,1,0.0,1.60559e-15,2.51292e-16,1.0,5.02584e-16,0.902969,0.0,0.0,0.0
Mean-shift,2,0.0555602,0.025089,0.0147516,0.113515,0.0261102,0.902427,0.101836,0.0034749,0.428571
Optics,3,-0.0341463,0.00953937,0.0076981,0.0203393,0.0111689,0.87801,0.0148944,0.0148944,0.0148944
Gaussian-mixture,2,0.320127,0.18661,0.238548,0.153695,0.186944,0.823397,0.840665,0.851351,0.967105
