In [None]:
from glob import glob
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import scipy
import sklearn
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split,KFold
import matplotlib.pyplot as plt
import seaborn as sns
import time
import operator
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, fbeta_score,make_scorer, mutual_info_score, silhouette_score, normalized_mutual_info_score, classification_report, confusion_matrix,f1_score, mean_squared_error, adjusted_rand_score
import time 
from sklearn.model_selection import LearningCurveDisplay
import tensorflow_addons as tfa
from NNnet_class import NNnet
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA,FastICA
from sklearn.random_projection import johnson_lindenstrauss_min_dim,SparseRandomProjection,GaussianRandomProjection
from sklearn.manifold import TSNE,trustworthiness,Isomap
from A3_utils import calculate_wcss,plot_gallery, categorizer, create_elbow_plot,experiment_em_clusters,experiment_km_clusters
from kmodes.kmodes import KModes
import umap
from matplotlib.offsetbox import (AnnotationBbox, DrawingArea, OffsetImage,
                                  TextArea)

In [None]:
import warnings
from scipy.sparse import SparseEfficiencyWarning
# Ignore SparseEfficiencyWarning. The filter is installed globally, so it also
# applies to every cell that runs after this one.
warnings.simplefilter('ignore', category=SparseEfficiencyWarning)

In [None]:
#Setting seed of NumPy's global random number generator
np.random.seed(42)

#Making accuracy scorer (accuracy_score wrapped by make_scorer; passed as `scoring` to GridSearchCV below)
accuracy_scorer=make_scorer(accuracy_score)

#Global Beta (Used for Ftwo score - not used in assignment)
BETA=2

In [None]:
def create_param_curve(data,param,param_vals,ax,algorithim_name,algorithim,metric=accuracy_scorer,metric_name='Accuracy',graph=True,beta=BETA,cv=3):

    '''Function to create parameter curves

    For every value in param_vals the algorithim is cross validated with a shuffled
    KFold; the mean training and validation scores are drawn on ax together with a
    marker for the value that gave the best validation score.

    Parameters:
    data (list): List of np.arrays containing the features and labels
    param (str): Parameter to vary
    param_vals (list): Parameter values, the string 'Default' means "leave param unset"
    ax (matplotlib.axes): Axis for graph
    algorithim_name (str): Algorithim name, names containing 'net' also receive input_dims
    algorithim (sklearn.algorithim/NNnet): Algorithim to vary parameter in
    metric (sklearn.metric): Metric to score algorotihim on. Either a metric function called as
        metric(y_true=...,y_pred=...) or a make_scorer object wrapping such a function
    metric_name (str): Metric Name
    graph (bool): Bool for validation graph (currently unused)
    beta (int): Beta value, forwarded only to metrics whose signature has a beta argument
    cv (int): The number of cross validations folds

    Returns:
    None'''

    import inspect

    #A make_scorer object is called as scorer(estimator, X, y) and does not accept
    #y_true/y_pred, so score with the metric function it wraps (same sign and kwargs)
    score_func=getattr(metric,'_score_func',metric)
    score_sign=getattr(metric,'_sign',1)
    score_kwargs=dict(getattr(metric,'_kwargs',None) or {})

    #Only hand beta to metrics that actually take it (e.g. fbeta_score)
    try:
        accepts_beta='beta' in inspect.signature(score_func).parameters
    except (TypeError,ValueError):
        accepts_beta=False
    if accepts_beta:
        score_kwargs.setdefault('beta',beta)

    is_net='net' in algorithim_name.lower()

    train_acc=[]
    test_acc=[]

    for i in param_vals:
        #Crossvalidating each parameter value
        kf=KFold(n_splits=cv,shuffle=True)

        internal_train_accuracy=0
        internal_test_accuracy=0

        for train,test in kf.split(X=data[0]):
            kwargs={}
            if not (isinstance(i,str) and i=='Default'):
                kwargs[param]=i
            if is_net:
                #Nets need the feature count even when the varied parameter is left at its default
                kwargs['input_dims']=data[0].shape[-1]
            clf=algorithim(**kwargs)

            clf.fit(data[0][train],data[1][train])
            internal_train_accuracy+=score_sign*score_func(y_pred=clf.predict(data[0][train]),y_true=data[1][train],**score_kwargs)
            internal_test_accuracy+=score_sign*score_func(y_pred=clf.predict(data[0][test]),y_true=data[1][test],**score_kwargs)

        train_acc.append(internal_train_accuracy/cv)
        test_acc.append(internal_test_accuracy/cv)

    best_val=param_vals[np.argmax(test_acc)]

    #String valued parameters have no ordering, so they are drawn as points instead of a line
    if isinstance(param_vals[0],str):
        ax.scatter(param_vals,train_acc,label='Training {}'.format(metric_name))
        ax.scatter(param_vals,test_acc,label='Validation {}'.format(metric_name))
    else:
        ax.plot(param_vals,train_acc,label='Training {}'.format(metric_name))
        ax.plot(param_vals,test_acc,label='Validation {}'.format(metric_name))
    #plt.xscale('log')
    ax.axvline(best_val,label='Best {} Value'.format(param),color='red',linestyle = '--')
    ax.legend()
    ax.set_ylabel('{}'.format(metric_name));
    ax.set_xlabel(param);
    ax.set_title('{} vs {}'.format(param,metric_name));
    if not isinstance(best_val,str):
        #Write the best value next to the marker line, on the left when the line sits at the right edge
        text_y=(max(test_acc)+min(test_acc))/2+np.abs(np.std(test_acc))
        x_offset=np.std(param_vals)/12
        if best_val==max(param_vals):
            ax.text(y=text_y,x=best_val-x_offset,s=best_val,color='green',weight='bold')
        else:
            ax.text(y=text_y,x=best_val+x_offset,s=best_val,color='green',weight='bold')

In [None]:
def deal_algorithim(data,param_dicts,dataset,algorithim_name,algorithim,metric=accuracy_score,metric_name='Accuracy',cv=3):
    
    '''Function to deal with algorithim

    Draws one parameter curve per entry of param_dicts on a two row grid of panels.

    Parameters:
    data (list): List of np.arrays containing the features and labels
    param_dicts (dict): Parameter dictionary to vary (parameter name -> list of values)
    dataset (str): Dataset name
    algorithim_name (str): Algorithim name
    algorithim (sklearn.algorithim/NNnet): Algorithim to vary parameter in
    metric (sklearn.metric): Metric to score algorotihim on
    metric_name (str): Metric Name
    cv (int): The number of cross validations folds 

    Returns:
    None'''
    
    param_names=list(param_dicts.keys())

    #Nothing to plot (and plt.subplots cannot build a grid with zero columns)
    if not param_names:
        return

    #Getting Fig Size
    fig,axes=plt.subplots(2,int(np.ceil(len(param_names)/2)))
    fig.set_size_inches(15,15)
    plt.suptitle('Results for Algorithim: "{}" for "{}" Dataset'.format(algorithim_name,dataset),fontsize=18)

    panels=list(fig.axes)

    #zip stops at the last parameter, so a spare panel never triggers a lookup past the end
    for param,ax in zip(param_names,panels):
        create_param_curve(data,param,param_dicts[param],ax,algorithim_name,algorithim,metric,metric_name,cv=cv)

    #The grid always has an even number of panels; hide the ones that got no parameter
    for spare_ax in panels[len(param_names):]:
        spare_ax.set_visible(False)

    plt.tight_layout()

In [None]:
#Load Heart Disease Data
def load_heart_data(path='Data/Heart_2/heart.csv'):

    '''Load Heart Disease Dataset

    The numeric columns are kept as they are (in the CSV's column order) and every
    categorical column is one hot encoded and appended to the right of them.

    Parameters:
    path (str): Location of the heart disease csv file

    Returns:
    X (np.array): X array
    Y (np.array): Y array
    col_index (dict): Dictionary containing the pairing for the column location and it's name'''

    #PLEASE CHANGE TO LOCATION OF YOUR HEART DATA
    df=pd.read_csv(path)
    Y=np.array(df['HeartDisease'])
    df=df.drop('HeartDisease',axis=1)

    categorical_columns=['Sex', 'ChestPainType', 'RestingECG','ExerciseAngina','ST_Slope']

    #Keep the file's own column order: a set difference would order the columns
    #differently from one kernel session to the next
    non_categorical_variables=[c for c in df.columns if c not in categorical_columns]
    X=np.array(df[non_categorical_variables])
    columns_categorized=list(non_categorical_variables)

    #Now we need to one hot vectorize the categorical columns
    label_dict={}
    for i in categorical_columns:
        label_dict[i]=OneHotEncoder()
        res=label_dict[i].fit_transform(np.array(df[i]).reshape(-1,1)).toarray()
        X=np.c_[X,res]
        #One name per encoded column: '<column>%1', '<column>%2', ...
        columns_categorized=columns_categorized+['{}%{}'.format(i,j+1) for j in range(res.shape[-1])]

    #Built once, after all columns exist, so it is defined even with no categorical columns
    col_index={label:col for col,label in enumerate(columns_categorized)}
    return X,Y,col_index

#Fashion MNIST
def load_fmnist():

    '''Load Fashion MNIST and keep a stratified 10% sample of it

    Both csv files are stacked into one table, then only the held out 10% of a
    stratified split (fixed seed) is returned.

    Returns:
    X (np.array): Features of the sampled rows
    Y (np.array): Labels of the sampled rows'''

    #Loading dataset
    csv_files=['Data/FMNIST/fashion-mnist_train.csv','Data/FMNIST/fashion-mnist_test.csv']
    frames=[pd.read_csv(csv_file) for csv_file in csv_files]
    data=pd.concat(frames,axis=0)

    #First column is taken as the label, every other column as a feature
    Y=data.iloc[:,:1].to_numpy().ravel()
    X=data.iloc[:,1:].to_numpy()

    #Only the 10% side of the split is used
    _,X_sample,_,y_sample=train_test_split(X,Y,test_size=0.1,random_state=42,stratify=Y)

    return X_sample,y_sample


#Load Pokemon Data
def load_pokemon(directory='Data/Pokemon/'):

    '''Load the Pokemon images as flattened 48x48 grayscale vectors

    The label of an image is the name of the first folder below directory that
    contains it.

    Parameters:
    directory (str): Folder that holds one sub folder of images per pokemon

    Returns:
    features (np.array): One flattened grayscale image per row
    labels_encoded (np.array): Integer encoded pokemon names
    pokemon_names (LabelEncoder): Fitted encoder to map the integers back to names'''

    #Sorted so that the row order does not depend on the order the file system lists files in
    image_files = sorted(glob(os.path.join(directory, '**', '*.*'), recursive=True))

    # Initialize a list to store the images as NumPy arrays
    features=[]
    labels=[]

    # Iterate over the Image files
    for file in image_files:

        #Name of pokemon: first path component below directory, independent of the
        #path separator of the operating system
        label=os.path.relpath(file,directory).split(os.sep)[0]

        #Appending name
        labels.append(label)

        #Reading images (the context manager closes the file handle again)
        with Image.open(file) as raw_image:

            #Converting to RGB
            image = raw_image.convert('RGB')

            #Removing color
            image=image.convert('L')

            #Resizing images
            image=image.resize((48, 48))

            #Appending images
            features.append(np.array(image).flatten())

    #Converting to Numpy
    labels=np.array(labels)
    features=np.array(features)

    pokemon_names=LabelEncoder()
    labels_encoded=pokemon_names.fit_transform(labels)

    return features,labels_encoded,pokemon_names

In [None]:
def score_algorithim(X,Y,dataset_name,algorithim_name,algorithim,params,predictor_metric=accuracy_score,grid_search_metric=accuracy_scorer,predictor_metric_name='accuracy',return_needed=False):

    '''Function to score algorithim

    Splits the data into train/test, grid searches params on the train part, then
    times fitting and inference of the best estimator and prints its scores.

    Parameters:
    X (np.array): All features
    Y (np.array): All labels
    dataset_name (str): Dataset name
    algorithim_name (str): Algorithim name (names containing KNN, SVM or NET get standardized data)
    algorithim (sklearn.algorithim/NNnet): Algorithim being experimented on
    params (dict): Parameter dictionary for GridSearch
    predictor_metric (sklearn.metric): Metric to score algorotihim on
    grid_search_metric (sklearn.metric): Metric used by GridSearchCV
    predictor_metric_name (str): Metric Name for predictor metric (a name containing 'F' makes BETA get passed to the metric)
    return_needed (bool): Bool if fit model needed to be returned    

    Returns:
    glf (sklearn.model/NNnet): Fit model (only when return_needed is True, otherwise None)'''

    name_upper=algorithim_name.upper()
    standardize=any(tag in name_upper for tag in ('KNN','SVM','NET'))

    #Nets are grid searched in a single process, everything else on all cores
    n_jobs=1 if 'NET' in name_upper else -1

    train,test=split_data(X,Y,valid=False,standardize=standardize)

    glf_cv=GridSearchCV(algorithim,param_grid=params,verbose=0,n_jobs=n_jobs,cv=3,scoring=grid_search_metric,refit=True)
    glf_cv.fit(train[0],train[1])

    glf=glf_cv.best_estimator_

    rep=10

    #The clock starts after the grid search so that only fitting the chosen model is timed
    start_time=time.perf_counter()
    for _ in range(rep):
        glf.fit(train[0],train[1])
    time_delay_train=(time.perf_counter()-start_time)/rep

    start_time=time.perf_counter()
    for _ in range(rep):
        y_test_pred=glf.predict(test[0])
    time_delay_infer=(time.perf_counter()-start_time)/rep

    y_train_pred=glf.predict(train[0])

    #F style metrics additionally receive the global BETA
    metric_kwargs={'beta':BETA} if 'F' in predictor_metric_name.upper() else {}
    train_score = predictor_metric(y_pred=y_train_pred,y_true=train[1],**metric_kwargs)
    test_score = predictor_metric(y_pred=y_test_pred,y_true=test[1],**metric_kwargs)

    print('{} Final Results'.format(dataset_name))
    print('\n')
    print(glf_cv.best_params_)
    print('\n')
    print('Average time to train the ideal {} was {:.3f} seconds'.format(algorithim_name,time_delay_train))
    print('Average time to infer the ideal {} was {:.3f} seconds'.format(algorithim_name,time_delay_infer))
    print('\n')
    print('The result on the training data for the ideal {} algorithim is a {} {} score'.format(algorithim_name,train_score,predictor_metric_name))
    print('The result on the test data for the ideal {} algorithim is a {} {} score'.format(algorithim_name,test_score,predictor_metric_name))
    
    if return_needed:
        return glf

In [None]:
def split_data(X,Y,valid=True,standardize=False):

    '''
    Split the data into train, optional validation and test parts

    Parameters:
    X (np.array): X features
    Y (np.array): Labels
    valid (bool): Also carve a validation part out of the training part
    standardize (bool): Standardize the features with a StandardScaler that is fitted on the training part only

    Returns:
    train (list): [features, labels] of the training part
    valid (list): [features, labels] of the validation part (only when valid is True)
    test (list): [features, labels] of the test part
    '''

    #Standard 80/20 split between train and test
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=42)

    #Feature arrays that must go through the scaler fitted on the training part
    held_out=[X_test]

    if valid:
        #Another 80/20 split, this time of the training part into train and validation
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2,random_state=42)
        held_out=[X_valid,X_test]

    if standardize:
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train)
        held_out=[scaler.transform(part) for part in held_out]

    if valid:
        return [X_train,y_train],[held_out[0],y_valid],[held_out[1],y_test]

    return [X_train,y_train],[held_out[0],y_test]

In [None]:
#Pokemon
#Each dataset is loaded and then split 80/20 into train/test without a validation
#part and without standardizing (split_data is called with valid=False and the
#default standardize=False).

#Pokemon
X_poke,Y_poke,poke_encoder=load_pokemon()
train_poke,test_poke=split_data(X_poke,Y_poke,valid=False)

#Heart
X_heart,Y_heart,heart_index=load_heart_data()
train_heart,test_heart=split_data(X_heart,Y_heart,valid=False)

#Fashion MNIST
X_fm,Y_fm=load_fmnist()
train_fm,test_fm=split_data(X_fm,Y_fm,valid=False)

In [None]:
#Standardizing
#Standardizing
#Two scalers per dataset:
#  sklr_*     is fitted on the training split and then applied to the test split
#  X_*_scaler is fitted on the complete dataset (X_*_scaled)
sklr_poke=StandardScaler()
sklr_heart=StandardScaler()
sklr_fm=StandardScaler()

X_poke_scaler=StandardScaler()
X_heart_scaler=StandardScaler()
X_fm_scaler=StandardScaler()

#poke
X_poke_scaled=X_poke_scaler.fit_transform(X_poke)

#list.copy() is shallow: only slot 0 (the features) is replaced below, the labels stay shared
train_poke_standardized=train_poke.copy()
#Copy of the training split (this used to be taken from the test split by mistake)
train_poke_unstandardized=train_poke.copy()

test_poke_standardized=test_poke.copy()

train_poke_standardized[0]=sklr_poke.fit_transform(train_poke[0])
test_poke_standardized[0]=sklr_poke.transform(test_poke[0])

#Heart
X_heart_scaled=X_heart_scaler.fit_transform(X_heart)

train_heart_standardized=train_heart.copy()

test_heart_standardized=test_heart.copy()

train_heart_standardized[0]=sklr_heart.fit_transform(train_heart[0])
test_heart_standardized[0]=sklr_heart.transform(test_heart[0])

#FMNIST
X_fm_scaled=X_fm_scaler.fit_transform(X_fm)

train_fm_standardized=train_fm.copy()
#Copy of the training split (this used to be taken from the test split by mistake)
train_fm_unstandardized=train_fm.copy()

test_fm_standardized=test_fm.copy()

train_fm_standardized[0]=sklr_fm.fit_transform(train_fm[0])
test_fm_standardized[0]=sklr_fm.transform(test_fm[0])

# Clustering Algorithms on Reduced Datasets

## FMNIST

In [None]:
#Data used by the cells of this section: the fully standardized Fashion MNIST
#sample and its labels
DATASET=X_fm_scaled
LABELS=Y_fm

### PCA

In [None]:
#Number of principal components kept for dataset 1
chosen_n_pca_d1=95

#Creating PCA
d1_pca=PCA(n_components=chosen_n_pca_d1,random_state=21)

#Transforming Data
dataset_transformed_PCA=d1_pca.fit_transform(DATASET)

#### EM

In [None]:
#Testing EM algorithm with different number of components
#(experiment_em_clusters is imported from A3_utils; it receives the PCA reduced data and the true labels)
dataset_name='FMNIST PCA'
dataset=dataset_transformed_PCA
labels_og=LABELS
algorithm_name='EM'

experiment_em_clusters(dataset=dataset,dataset_name=dataset_name,labels_og=labels_og,algorithm_name=algorithm_name)

In [None]:
#Evaluating Best Clusters
#best_clusters_PCA_d1=n_list[np.argmin(BIC_scores)]
dataset=dataset_transformed_PCA

#Selecting N based on Plots
best_clusters_PCA_d1=13
mi_score=0
aj_score=0
repeats=5

#Fit one mixture per repeat, each with its own seed, and accumulate the mean of
#both scores. best_em_pca_d1 is left holding the model of the last repeat.
for i in range(repeats):
    best_em_pca_d1=GaussianMixture(n_components=best_clusters_PCA_d1,random_state=21*i)

    best_em_pca_d1.fit(dataset)
    mi_score+=normalized_mutual_info_score(LABELS,best_em_pca_d1.predict(dataset))/repeats
    aj_score+=adjusted_rand_score(LABELS,best_em_pca_d1.predict(dataset))/repeats


#Evaluation Normalized MI Score
print(f'The Normalized Mutual Info Score for the best clustering iS {mi_score}') 

#Evaluating Adjusted Rand Score
print(f'The Adjusted Rand Score for the best clustering iS {aj_score}') 

In [None]:
#Visualizing
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Transforming dataset 1 based on ideal 
dataset_transformed=tsne_vis.fit_transform(dataset_transformed_PCA)

#plt.subplots creates the figure itself; an extra plt.figure() before it would
#only leave an empty figure behind
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations EM Clustering Results with Labels (PCA FMINST)',fontsize=18)

#Left panel is coloured by the true labels, right panel by the cluster assignment
LABS=[LABELS,best_em_pca_d1.predict(dataset_transformed_PCA)]

#Mean 2D position of every true class, used to place an example image
position_dict={}

for lab in np.unique(LABELS):
    x,y=dataset_transformed[LABELS==lab][:,0].mean(),dataset_transformed[LABELS==lab][:,1].mean()
    position_dict[lab]=[x,y]

axis_int=0
for ax,labels_ in zip(axes,LABS):

    artists=[]
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)
    # Overlay each image within the scatter plot

    for lab in np.unique(LABELS):
        # Positioning Images
        image_x = position_dict[lab][0]
        image_y = position_dict[lab][1]
        #First sample of the class, reshaped back to a 28x28 image
        im=OffsetImage(DATASET[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    if axis_int==0:
        ax.set_title(f'Actual')
        axis_int+=1
    else:
        ax.set_title(f'Clustered')

    ax.legend(*sc.legend_elements(), title='Label')
#plt.tight_layout()

#### K MODES

In [None]:
#Testing K_Modes algorithm with different number of components
#(experiment_km_clusters is imported from A3_utils; categorical_cols is passed through as None)
dataset_name='FMNIST (PCA)'
dataset=dataset_transformed_PCA
labels_og=LABELS
algorithm_name='K-MODES'
categorical_cols=None

experiment_km_clusters(dataset=dataset,dataset_name=dataset_name,labels_og=labels_og,algorithm_name=algorithm_name,categorical_cols=categorical_cols)

In [None]:
#categorizer is imported from A3_utils
#NOTE(review): presumably it turns the continuous PCA features into categories so that K-Modes can use them - confirm in A3_utils
categorical_cols=None
data_categorized=categorizer(dataset_transformed_PCA,categorical_cols)

#Best Number of Clusters
best_kmodes_PCA_d1=9
dataset=data_categorized

print("DATASET 1 PCA KMODES RESULTS \n\n")

mi_score=0
aj_score=0
repeats=5

#Fit a fresh K-Modes model per repeat and accumulate the mean of both scores.
#km is left holding the model of the last repeat.
for i in range(repeats):
    km = KModes(n_clusters=best_kmodes_PCA_d1, init='Huang', n_init=3, verbose=0)

    km.fit(data_categorized,categorical=np.arange(data_categorized.shape[-1]))
    mi_score+=normalized_mutual_info_score(LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1])))/repeats
    aj_score+=adjusted_rand_score(LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1])))/repeats


#Evaluation Normalized MI Score
print(f'The Normalized Mutual Info Score for the best clustering iS {mi_score}') 

#Evaluating Adjusted Rand Score
print(f'The Adjusted Rand Score for the best clustering iS {aj_score}') 

In [None]:
#Plotting the K-Modes clusters next to the true labels in a 2D t-SNE embedding
#Visualizing
categorical_cols=None
data_categorized=categorizer(dataset_transformed_PCA,categorical_cols)
#Refits the K-Modes model left over from the evaluation cell on the categorized data
km.fit(data_categorized,categorical=np.arange(data_categorized.shape[-1]))
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Transforming dataset 1 based on ideal 
dataset_transformed=tsne_vis.fit_transform(dataset_transformed_PCA)

#fig=plt.figure()
fig, axes = plt.subplots(1,2,figsize=(25, 5))

position_dict={}

plt.suptitle('Visualizations K-MODES Clustering Results with Labels (PCA FMINST)',fontsize=18)

#Left panel is coloured by the true labels, right panel by the cluster assignment.
#`categorical` is given as the array of column indices, like in the fit call above.
LABS=[LABELS,km.predict(data_categorized,categorical=np.arange(data_categorized.shape[-1]))]

#Mean 2D position of every true class, used to place an example image
for lab in np.unique(LABELS):
    x,y=dataset_transformed[LABELS==lab][:,0].mean(),dataset_transformed[LABELS==lab][:,1].mean()
    position_dict[lab]=[x,y]

axis_int=0
for ax,labels_ in zip(axes,LABS):

    artists=[]

    #One scatter per label so that every label gets its own legend entry. No second
    #scatter is drawn over all points: it would hide these points with differently
    #normalized colours, so the legend would no longer match what is visible.
    unique_labels=np.unique(labels_)
    for l in unique_labels:
        # Choose color from a colormap
        color = plt.cm.viridis(l / len(unique_labels))
        ax.scatter(dataset_transformed[:, 0][labels_ == l], dataset_transformed[:, 1][labels_ == l],
                   label=l, zorder=1, color=color)

    # Overlay each image within the scatter plot

    for lab in np.unique(LABELS):
        # Positioning Images
        image_x = position_dict[lab][0]
        image_y = position_dict[lab][1]
        #First sample of the class, reshaped back to a 28x28 image
        im=OffsetImage(X_fm_scaled[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    if axis_int==0:
        ax.set_title(f'Actual')
        axis_int+=1
    else:
        ax.set_title(f'Clustered')

    ax.legend()
plt.tight_layout()

## ICA

In [None]:
#Number of independent components kept for dataset 1
chosen_n_ica_d1=66

#Creating ICA (no random_state is passed here, unlike the PCA cell)
d1_ica=FastICA(n_components=chosen_n_ica_d1)

#Transforming Data
dataset_transformed_ICA=d1_ica.fit_transform(DATASET)

#### EM

In [None]:
#Testing EM algorithm with different number of components
#(experiment_em_clusters is imported from A3_utils; it receives the ICA reduced data and the true labels)
dataset_name='FMNIST ICA'
dataset=dataset_transformed_ICA
labels_og=LABELS
algorithm_name='EM'

experiment_em_clusters(dataset=dataset,dataset_name=dataset_name,labels_og=labels_og,algorithm_name=algorithm_name)

In [None]:
#Evaluating Best Clusters
#best_clusters_ICA_d1=n_list[np.argmin(BIC_scores)]

#Selecting Based on plots
dataset=dataset_transformed_ICA

REPEATS=5

best_clusters_ICA_d1=18

mi_score=0
aj_score=0

#Fit one mixture per repeat, each with its own seed, like the PCA evaluation does.
#Scoring a single fitted model REPEATS times would only average identical numbers.
#best_em_ica_d1 is left holding the model of the last repeat.
for i in range(REPEATS):
    best_em_ica_d1=GaussianMixture(n_components=best_clusters_ICA_d1,random_state=21*i)
    best_em_ica_d1.fit(dataset)

    mi_score+=normalized_mutual_info_score(LABELS,best_em_ica_d1.predict(dataset))/REPEATS
    aj_score+=adjusted_rand_score(LABELS,best_em_ica_d1.predict(dataset))/REPEATS

print("ICA RESULTS \n\n")

#Evaluation Normalized MI Score
print(f'The Normalized Mutual Info Score for the best clustering is {mi_score}') 

#Evaluating Adjusted Rand Score
print(f'The Adjusted Rand Score for the best clustering is {aj_score}') 

In [None]:
#Visualizing
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Transforming dataset 1 based on ideal (the embedding is computed on the full standardized data)
dataset_transformed=tsne_vis.fit_transform(DATASET)

#plt.subplots creates the figure itself; an extra plt.figure() before it would
#only leave an empty figure behind
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations EM Clustering Results with Labels (ICA FMINST)',fontsize=18)

#Left panel is coloured by the true labels, right panel by the cluster assignment
LABS=[LABELS,best_em_ica_d1.predict(dataset_transformed_ICA)]

#Mean 2D position of every true class, used to place an example image
position_dict={}

for lab in np.unique(LABELS):
    x,y=dataset_transformed[LABELS==lab][:,0].mean(),dataset_transformed[LABELS==lab][:,1].mean()
    position_dict[lab]=[x,y]

axis_int=0
for ax,labels_ in zip(axes,LABS):

    artists=[]
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)
    # Overlay each image within the scatter plot

    for lab in np.unique(LABELS):
        # Positioning Images
        image_x = position_dict[lab][0]
        image_y = position_dict[lab][1]
        #First sample of the class, reshaped back to a 28x28 image
        im=OffsetImage(DATASET[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    if axis_int==0:
        ax.set_title(f'Actual')
        axis_int+=1
    else:
        ax.set_title(f'Clustered')

    ax.legend(*sc.legend_elements(), title='Label')

#### K MODES

In [None]:
#Testing K_Modes algorithm with different number of components
n_list=[2,3,5,7,10,15,25]
dataset_name='FMNIST (ICA)'
dataset=dataset_transformed_ICA
labels_og=LABELS
algorithm_name='K-MODES'
#Number of cluster configurations that get their own scatter panel
cluster_graphs=4
categorical_cols=None

data_categorized=categorizer(dataset,categorical_cols)

#Converting dataset to 2D so it's plottable
converter=TSNE(n_components=2)
dataset_2d=converter.fit_transform(data_categorized)
silhouette_scores=[]
all_labels=[]
wcss_scores=[]
KM_cost=[]
KM_dict={}


#Assigning Clusters
for n in n_list:
    #Creating K-Modes model
    km = KModes(n_clusters=n, init='Huang', n_init=5, verbose=0)

    #Fitted labels dataset
    km.fit(data_categorized,categorical=np.arange(data_categorized.shape[-1]))

    #Predictions
    labels=km.predict(data_categorized,categorical=np.arange(data_categorized.shape[-1]))
    all_labels.append(labels)
    
    #Calculating Silhouette Score (disabled)
    #silhouette_scores.append(silhouette_score(data_categorized,labels))

    #WCSS Score
    wcss_scores.append(calculate_wcss(data_categorized,labels,km.cluster_centroids_))

    KM_dict[n]=km

    KM_cost.append(km.cost_)

#Plotting 2d color coded with labels 
#One panel for the true labels plus one panel for each of the first cluster_graphs entries of n_list
fig,axes=plt.subplots(1,len(n_list[:cluster_graphs])+1)
fig.set_size_inches(35,5)

plt.suptitle(f'{dataset_name} dataset clusters using {algorithm_name}',fontsize=18)

for c,ax in enumerate(fig.axes):

    if c>0:
        #Predictions (panel c shows clustering c-1, panel 0 is taken by the true labels)
        labels=all_labels[c-1]

        #Plotting
        sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels,cmap='plasma')
        #ax.legend()
        ax.set_ylabel('Y');
        ax.set_xlabel('X');
        ax.set_title('{} : COMPONENTS'.format(n_list[c-1]));
        ax.legend(*sc.legend_elements(), title='clusters')

    else:

        #First panel: the data coloured by its true labels
        sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels_og,cmap='plasma')
        #ax.legend()
        ax.set_ylabel('Y');
        ax.set_xlabel('X');
        ax.set_title('Dataset');      
        ax.legend(*sc.legend_elements(), title='clusters')

plt.tight_layout()

#Plotting WCSS
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('WCSS & Clusters')
plt.plot(n_list,wcss_scores)
plt.scatter(n_list, wcss_scores, s=100, c='blue', marker='o', edgecolors='black')
#create_elbow_plot (from A3_utils) draws on a twin y axis
ax_2=plt.gca().twinx()
create_elbow_plot(n_list,wcss_scores,ax_2)
plt.xlabel('N_clusters')
plt.ylabel('WCSS')
plt.legend()

plt.tight_layout()

#Plotting Cost
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('Cost & Clusters')
plt.plot(n_list,KM_cost)
plt.scatter(n_list, KM_cost, s=100, c='blue', marker='o', edgecolors='black')
ax_2=plt.gca().twinx()
create_elbow_plot(n_list,KM_cost,ax_2)
plt.xlabel('N_clusters')
plt.ylabel('Cost')


plt.tight_layout()

#Plotting Silhouette score
# fig=plt.figure()
# fig.set_size_inches(10,5)

# plt.title('Silhouette_scores & Clusters')
# plt.plot(n_list,silhouette_scores)
# plt.scatter(n_list, silhouette_scores, s=100, c='blue', marker='o', edgecolors='black')
# plt.xlabel('N_clusters')
# plt.ylabel('Silhouette_scores')

# plt.tight_layout()

In [None]:
#categorizer is imported from A3_utils
#NOTE(review): presumably it turns the continuous ICA features into categories so that K-Modes can use them - confirm in A3_utils
categorical_cols=None
data_categorized=categorizer(dataset_transformed_ICA,categorical_cols)

#Best Number of Clusters
best_kmodes_ICA_d1=15
dataset=data_categorized

#Evaluation
#NOTE(review): a single fit is scored here, while the PCA K-Modes evaluation averages 5 fits

#Creating Best KMODES
km = KModes(n_clusters=best_kmodes_ICA_d1, init='Huang', n_init=3, verbose=0)

#Fitted labels dataset
km.fit(data_categorized,categorical=np.arange(data_categorized.shape[-1]))

print("DATASET 1 KMODES RESULTS \n\n")

#Evaluation Normalized MI Score
mi_score=normalized_mutual_info_score(LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1])))
print(f'The Normalized Mutual Info Score for the best clustering iS {mi_score}') 

#Evaluating Adjusted Rand Score
aj_score=adjusted_rand_score(LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1])))
print(f'The Adjusted Rand Score for the best clustering iS {aj_score}') 

In [None]:
#Plotting the K-Modes clusters next to the true labels in a 2D t-SNE embedding
#Visualizing
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Transforming dataset 1 based on ideal (the embedding is computed on the full standardized data)
dataset_transformed=tsne_vis.fit_transform(DATASET)

#plt.subplots creates the figure itself; an extra plt.figure() before it would
#only leave an empty figure behind
fig, axes = plt.subplots(1,2,figsize=(25, 5))

position_dict={}

plt.suptitle('Visualizations K-MODES Clustering Results with Labels (ICA FMINST)',fontsize=18)

#Left panel is coloured by the true labels, right panel by the cluster assignment.
#`dataset` is the categorized ICA data set by the evaluation cell; `categorical` is
#given as the array of column indices, like in the fit call of that cell.
LABS=[LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1]))]

#Mean 2D position of every true class, used to place an example image
for lab in np.unique(LABELS):
    x,y=dataset_transformed[LABELS==lab][:,0].mean(),dataset_transformed[LABELS==lab][:,1].mean()
    position_dict[lab]=[x,y]

axis_int=0
for ax,labels_ in zip(axes,LABS):

    artists=[]
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)
    # Overlay each image within the scatter plot

    for lab in np.unique(LABELS):
        # Positioning Images
        image_x = position_dict[lab][0]
        image_y = position_dict[lab][1]
        #First sample of the class, reshaped back to a 28x28 image
        im=OffsetImage(X_fm_scaled[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    if axis_int==0:
        ax.set_title(f'Actual')
        axis_int+=1
    else:
        ax.set_title(f'Clustered')

    ax.legend(*sc.legend_elements(), title='Label')
plt.tight_layout()

## Random Projection

In [None]:
#Number of output dimensions of the random projection for dataset 1
chosen_n_rgp_d1=666

#Creating Gaussian random projection
d1_rgp=GaussianRandomProjection(n_components=chosen_n_rgp_d1,random_state=42)

#Transforming Data
dataset_transformed_RGP=d1_rgp.fit_transform(DATASET)

#### EM

In [None]:
#Testing EM algorithm with different number of components
n_list=[2,3,5,7,9,10,15,25]
dataset_name='FMNIST RGP'
dataset=dataset_transformed_RGP
labels_og=LABELS
algorithm_name='EM'
#Number of cluster configurations that get their own scatter panel
cluster_graphs=4
repeats=3

#Converting dataset to 2D so it's plottable
converter=TSNE(n_components=2,perplexity=45)
dataset_2d=converter.fit_transform(dataset)

#Average of 3 runs: every run adds its share (value/repeats) to these accumulators
silhouette_scores=np.zeros(shape=len(n_list))
wcss_scores=np.zeros(shape=len(n_list))
BIC_scores=np.zeros(shape=len(n_list))
AIC_scores=np.zeros(shape=len(n_list))
EM_dict={}

#One seed per repeat: 21, 42, 63
for r in [21*(1+i) for i in range(repeats)]:

    #Will be calculated for each run for plots
    all_labels=[]
    
    #Assigning Clusters
    for i,n in enumerate(n_list):
        #Creating Gaussian
        em=GaussianMixture(n_components=n,random_state=r)

        #Fitted labels dataset
        em.fit(dataset)

        #Predictions
        labels=em.predict(dataset)
        all_labels.append(labels)
        
        #Silhouette Score (accumulated with += so the result is the mean over the runs,
        #a plain assignment would keep only the last run divided by repeats)
        silhouette_scores[i]+=silhouette_score(dataset,labels)/repeats

        #WCSS Score
        wcss_scores[i]+=calculate_wcss(dataset,labels,em.means_)/repeats

        #BIC Scores
        BIC_scores[i]+=em.bic(dataset)/repeats

        #AIC
        AIC_scores[i]+=em.aic(dataset)/repeats

        #Saving EM models for final Run (the model of the last repeat wins)
        EM_dict[n]=em

    #Plotting 2d color coded with labels 
    #One panel for the true labels plus one panel for each of the first cluster_graphs entries of n_list
    fig,axes=plt.subplots(1,len(n_list[:cluster_graphs])+1)
    fig.set_size_inches(35,5)

    plt.suptitle(f'{dataset_name} dataset clusters using {algorithm_name}',fontsize=18)

    for c,ax in enumerate(fig.axes):

        if c>0:
            #Predictions (panel c shows clustering c-1, panel 0 is taken by the true labels)
            labels=all_labels[c-1]

            #Plotting
            sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels,cmap='plasma')
            #ax.legend()
            ax.set_ylabel('Y');
            ax.set_xlabel('X');
            ax.set_title('{} : COMPONENTS'.format(n_list[c-1]));
            ax.legend(*sc.legend_elements(), title='clusters')

        else:

            #First panel: the data coloured by its true labels
            sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels_og,cmap='plasma')
            #ax.legend()
            ax.set_ylabel('Y');
            ax.set_xlabel('X');
            ax.set_title('Dataset');      
            ax.legend(*sc.legend_elements(), title='clusters')

    plt.tight_layout()

#Plotting Silhouette score
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('Silhouette_scores & Clusters')
plt.plot(n_list,silhouette_scores)
plt.scatter(n_list, silhouette_scores, s=100, c='blue', marker='o', edgecolors='black')
plt.xlabel('N_clusters')
plt.ylabel('Silhouette_scores')

plt.tight_layout()

#Plotting WCSS
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('WCSS & Clusters')
plt.plot(n_list,wcss_scores)
plt.scatter(n_list, wcss_scores, s=100, c='blue', marker='o', edgecolors='black')
plt.xlabel('N_clusters')
plt.ylabel('WCSS')

plt.tight_layout()

#Plotting BIC
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('BIC Score & Clusters')
plt.plot(n_list,BIC_scores)
plt.scatter(n_list, BIC_scores, s=100, c='blue', marker='o', edgecolors='black')
plt.xlabel('N_clusters')
plt.ylabel('BIC Score')

plt.tight_layout()

#Plotting AIC
fig=plt.figure()
fig.set_size_inches(10,5)

plt.title('AIC Score & Clusters')
plt.plot(n_list,AIC_scores)
plt.scatter(n_list, AIC_scores, s=100, c='blue', marker='o', edgecolors='black')
plt.xlabel('N_clusters')
plt.ylabel('AIC Score')

plt.tight_layout()

In [None]:
#Evaluating Best Clusters based on graphs
dataset=dataset_transformed_RGP

best_clusters_RGP_d1=5
mi_score=0
aj_score=0
repeats=5

#Fit one mixture per repeat, each with its own seed, and accumulate the mean of
#both scores, like the PCA evaluation does. Repeating the fit with one fixed seed
#would only reproduce the same numbers. best_em_rgp_d1 is left holding the model
#of the last repeat.
for i in range(repeats):
    best_em_rgp_d1=GaussianMixture(n_components=best_clusters_RGP_d1,random_state=21*i)
    best_em_rgp_d1.fit(dataset)
    #best_em_igp_d1=EM_dict[best_clusters_RGP_d1]

    rgp_cluster_labels=best_em_rgp_d1.predict(dataset)
    mi_score+=normalized_mutual_info_score(LABELS,rgp_cluster_labels)/repeats
    aj_score+=adjusted_rand_score(LABELS,rgp_cluster_labels)/repeats

print("RGP RESULTS \n\n")

#Evaluation Normalized MI Score
print(f'The Normalized Mutual Info Score for the best clustering is {mi_score}') 

#Evaluating Adjusted Rand Score
print(f'The Adjusted Rand Score for the best clustering is {aj_score}') 

In [None]:
#Visualizing
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Embedding DATASET in 2-D so the points can be plotted
dataset_transformed=tsne_vis.fit_transform(DATASET)

#plt.subplots creates the figure itself; a separate plt.figure() call beforehand
#would only leave an extra empty figure in the output
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations EM Clustering Results with Labels (RGP FMINST)',fontsize=18)

#First panel is coloured by LABELS, second by the EM cluster assignments
LABS=[LABELS,best_em_rgp_d1.predict(dataset_transformed_RGP)]

#Mean 2-D position of each class in LABELS, used to anchor one example image per class
position_dict={}
for lab in np.unique(LABELS):
    class_points=dataset_transformed[LABELS==lab]
    position_dict[lab]=[class_points[:,0].mean(),class_points[:,1].mean()]

for ax,labels_,panel_title in zip(axes,LABS,('Actual','Clustered')):

    artists=[]
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)

    #Overlay the first image of each class at that class's mean position
    for lab in np.unique(LABELS):
        image_x,image_y=position_dict[lab]
        im=OffsetImage(DATASET[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    ax.set_title(panel_title)
    ax.legend(*sc.legend_elements(), title='Label')

#### K MODES

## ISOMAP

In [None]:
#Isomap settings: number of output components and neighbours per point
chosen_n_iso_d1=20
choosen_n_iso_d1_neighbors=45

#Creating Isomap
iso_map_d1=Isomap(
    n_neighbors=choosen_n_iso_d1_neighbors,
    n_components=chosen_n_iso_d1,
    n_jobs=-1,
)

#Fitting on DATASET and projecting it into the Isomap space
dataset_transformed_ISO=iso_map_d1.fit_transform(DATASET)

#### EM

In [None]:
#Running the EM cluster experiment on the Isomap-transformed data
dataset=dataset_transformed_ISO
dataset_name='FMNIST ISO'
algorithm_name='EM'
labels_og=LABELS

experiment_em_clusters(
    dataset=dataset,
    dataset_name=dataset_name,
    labels_og=labels_og,
    algorithm_name=algorithm_name,
)

In [None]:
#Scoring the chosen EM model against LABELS, averaged over several seeds
dataset=dataset_transformed_ISO

#Component count selected from the plots
best_clusters_ISO_d1=14
repeats=5
mi_score,aj_score=0,0

print("ISOMAP RESULTS \n\n")

for seed in [21*run for run in range(repeats)]:
    best_em_iso_d1=GaussianMixture(n_components=best_clusters_ISO_d1,random_state=seed)
    best_em_iso_d1.fit(dataset)

    #Cluster assignments for this seed, shared by both scores
    cluster_ids=best_em_iso_d1.predict(dataset)
    mi_score+=normalized_mutual_info_score(LABELS,cluster_ids)/repeats
    aj_score+=adjusted_rand_score(LABELS,cluster_ids)/repeats

#Mean Normalized Mutual Info score over the repeats
print(f'The Normalized Mutual Info Score for the best clustering iS {mi_score}')

#Mean Adjusted Rand score over the repeats
print(f'The Adjusted Rand Score for the best clustering iS {aj_score}')

In [None]:
#Visualizing
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Embedding the Isomap-transformed data in 2-D so the points can be plotted
dataset_transformed=tsne_vis.fit_transform(dataset_transformed_ISO)

#plt.subplots creates the figure itself; a separate plt.figure() call beforehand
#would only leave an extra empty figure in the output
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations EM Clustering Results with Labels (ISOMAP FMINST)',fontsize=18)

#First panel is coloured by LABELS, second by the EM cluster assignments
LABS=[LABELS,best_em_iso_d1.predict(dataset_transformed_ISO)]

#Mean 2-D position of each class in LABELS, used to anchor one example image per class
position_dict={}
for lab in np.unique(LABELS):
    class_points=dataset_transformed[LABELS==lab]
    position_dict[lab]=[class_points[:,0].mean(),class_points[:,1].mean()]

for ax,labels_,panel_title in zip(axes,LABS,('Actual','Clustered')):

    artists=[]
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)

    #Overlay the first image of each class at that class's mean position
    for lab in np.unique(LABELS):
        image_x,image_y=position_dict[lab]
        im=OffsetImage(DATASET[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    ax.set_title(panel_title)
    ax.legend(*sc.legend_elements(), title='Label')

#### K MODES

In [None]:
#Running the K-Modes cluster experiment on the Isomap-transformed data
dataset=dataset_transformed_ISO
dataset_name='FMNIST ISO'
algorithm_name='K-MODES'
labels_og=LABELS
categorical_cols=None

experiment_km_clusters(
    dataset=dataset,
    dataset_name=dataset_name,
    labels_og=labels_og,
    algorithm_name=algorithm_name,
    categorical_cols=categorical_cols,
)

In [None]:
#Passing the Isomap output through categorizer (categorical_cols=None) before K-Modes
categorical_cols=None
data_categorized=categorizer(dataset_transformed_ISO,categorical_cols)
dataset=data_categorized

#Best Number of Clusters
best_kmodes_ISO_d1=11
repeats=5
mi_score,aj_score=0,0

print("DATASET 1 ISO KMODES RESULTS \n\n")

#Every column index is handed to K-Modes as categorical
all_columns=np.arange(data_categorized.shape[-1])

for run in range(repeats):
    km = KModes(n_clusters=best_kmodes_ISO_d1, init='Huang', n_init=3, verbose=0)
    km.fit(data_categorized,categorical=all_columns)

    #Cluster assignments of this run, shared by both scores
    cluster_ids=km.predict(dataset,categorical=all_columns)
    mi_score+=normalized_mutual_info_score(LABELS,cluster_ids)/repeats
    aj_score+=adjusted_rand_score(LABELS,cluster_ids)/repeats

#Mean Normalized Mutual Info score over the repeats
print(f'The Normalized Mutual Info Score for the best clustering iS {mi_score}')

#Mean Adjusted Rand score over the repeats
print(f'The Adjusted Rand Score for the best clustering iS {aj_score}')

In [None]:
#Visualizing the K-Modes clusters next to LABELS on a 2-D t-SNE embedding
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Embedding the Isomap-transformed data in 2-D so the points can be plotted
dataset_transformed=tsne_vis.fit_transform(dataset_transformed_ISO)

#plt.subplots creates the figure itself; a separate plt.figure() call beforehand
#would only leave an extra empty figure in the output
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations K-MODES Clustering Results with Labels (ISOMAP FMINST)',fontsize=18)

#Same column-index array that was passed as `categorical` when km was fitted
LABS=[LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1]))]

#Mean 2-D position of each class in LABELS, used to anchor one example image per class
position_dict={}
for lab in np.unique(LABELS):
    class_points=dataset_transformed[LABELS==lab]
    position_dict[lab]=[class_points[:,0].mean(),class_points[:,1].mean()]

for ax,labels_,panel_title in zip(axes,LABS,('Actual','Clustered')):

    artists=[]

    #One scatter per label so each gets its own legend entry. No extra combined
    #scatter is drawn on top: it would hide these points behind differently
    #normalised colours and the legend would no longer match the plot.
    panel_labels=np.unique(labels_)
    for l in panel_labels:
        # Choose color from a colormap
        color = plt.cm.viridis(l / len(panel_labels))
        members=labels_ == l
        ax.scatter(dataset_transformed[:, 0][members], dataset_transformed[:, 1][members],
                   label=l, zorder=1, color=color)

    #Overlay the first image of each class at that class's mean position
    for lab in np.unique(LABELS):
        image_x,image_y=position_dict[lab]
        im=OffsetImage(X_fm_scaled[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    ax.set_title(panel_title)
    ax.legend()
#plt.tight_layout()

## UMAP

#### EM

In [None]:
#UMAP settings: number of output components and neighbours per point
chosen_n_umap_d1=11
choosen_n_umap_d1_neighbors=45

#Building the UMAP reducer
umap_map_d1=umap.UMAP(
    n_neighbors=choosen_n_umap_d1_neighbors,
    n_components=chosen_n_umap_d1,
    n_jobs=-1,
)

#Fitting on DATASET and projecting it into the UMAP space
dataset_transformed_UMAP=umap_map_d1.fit_transform(DATASET)

In [None]:
#Testing EM algorithm with different number of components
n_list=[2,3,5,7,9,10,15,25,35,45]
dataset_name='FMNIST UMAP'
dataset=dataset_transformed_UMAP
labels_og=LABELS
algorithm_name='EM'
cluster_graphs=4
repeats=3

#Converting dataset to 2D so it's plottable
converter=TSNE(n_components=2,perplexity=45)
dataset_2d=converter.fit_transform(dataset)

#Metric accumulators, one slot per entry of n_list. Every run adds score/repeats,
#so after the loop each slot holds the mean over all runs.
silhouette_scores=np.zeros(shape=len(n_list))
wcss_scores=np.zeros(shape=len(n_list))
BIC_scores=np.zeros(shape=len(n_list))
AIC_scores=np.zeros(shape=len(n_list))
EM_dict={}

#One full sweep over n_list per seed (21, 42, 63 for repeats=3)
for r in [21*(1+i) for i in range(repeats)]:

    #Cluster assignments of this run, kept for the scatter plots below
    all_labels=[]

    #Assigning Clusters
    for i,n in enumerate(n_list):
        #Creating Gaussian
        em=GaussianMixture(n_components=n,random_state=r)

        #Fitted labels dataset
        em.fit(dataset)

        #Predictions
        labels=em.predict(dataset)
        all_labels.append(labels)

        #Accumulating with += ; a plain assignment would keep only the last
        #run divided by `repeats` instead of the average

        #Silhouette Score
        silhouette_scores[i]+=silhouette_score(dataset,labels)/repeats

        #WCSS Score
        wcss_scores[i]+=calculate_wcss(dataset,labels,em.means_)/repeats

        #BIC Scores
        BIC_scores[i]+=em.bic(dataset)/repeats

        #AIC
        AIC_scores[i]+=em.aic(dataset)/repeats

        #Saving EM models for final Run (later runs overwrite earlier ones)
        EM_dict[n]=em

    #Plotting 2d color coded with labels:
    #one reference panel plus the first `cluster_graphs` clusterings of this run
    fig,axes=plt.subplots(1,len(n_list[:cluster_graphs])+1)
    fig.set_size_inches(35,5)

    plt.suptitle(f'{dataset_name} dataset clusters using {algorithm_name}',fontsize=18)

    for c,ax in enumerate(fig.axes):

        if c>0:
            #Panels 1..cluster_graphs show the clustering for n_list[c-1]
            labels=all_labels[c-1]

            sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels,cmap='plasma')
            ax.set_title('{} : COMPONENTS'.format(n_list[c-1]))

        else:
            #Panel 0 shows the original labels as a reference
            sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels_og,cmap='plasma')
            ax.set_title('Dataset')

        ax.set_ylabel('Y')
        ax.set_xlabel('X')
        ax.legend(*sc.legend_elements(), title='clusters')

    plt.tight_layout()

#Plotting every averaged metric against the number of clusters, one figure per metric.
#Each entry is (figure title, scores to plot, y-axis label).
metric_curves=[
    ('Silhouette_scores & Clusters',silhouette_scores,'Silhouette_scores'),
    ('WCSS & Clusters',wcss_scores,'WCSS'),
    ('BIC Score & Clusters',BIC_scores,'BIC Score'),
    ('AIC Score & Clusters',AIC_scores,'AIC Score'),
]

for curve_title,curve_scores,curve_ylabel in metric_curves:
    fig=plt.figure()
    fig.set_size_inches(10,5)

    plt.title(curve_title)
    plt.plot(n_list,curve_scores)
    plt.scatter(n_list, curve_scores, s=100, c='blue', marker='o', edgecolors='black')
    plt.xlabel('N_clusters')
    plt.ylabel(curve_ylabel)

    #Applied to every figure so all four are laid out the same way
    plt.tight_layout()

#### K MODES

In [None]:
#Testing K_Modes algorithm with different number of components
n_list=[2,3,5,7,10,15,25, 35, 45]
dataset_name='FMNIST (UMAP)'
dataset=dataset_transformed_UMAP
labels_og=LABELS
algorithm_name='K-MODES'
cluster_graphs=4
categorical_cols=None

#Passing the UMAP output through categorizer before K-Modes
data_categorized=categorizer(dataset,categorical_cols)

#Converting dataset to 2D so it's plottable
converter=TSNE(n_components=2)
dataset_2d=converter.fit_transform(data_categorized)

#Per-n results, in the same order as n_list
silhouette_scores=[]
all_labels=[]
wcss_scores=[]
KM_dict={}

#Every column index is handed to K-Modes as categorical
all_columns=np.arange(data_categorized.shape[-1])

#Assigning Clusters
for n in tqdm(n_list):
    #Creating KMODES
    km = KModes(n_clusters=n, init='Huang', n_init=3, verbose=0)

    #Fitted labels dataset
    km.fit(data_categorized,categorical=all_columns)

    #Predictions
    labels=km.predict(data_categorized,categorical=all_columns)
    all_labels.append(labels)

    #Silhouette Score
    silhouette_scores.append(silhouette_score(data_categorized,labels))

    #WCSS Score
    wcss_scores.append(calculate_wcss(data_categorized,labels,km.cluster_centroids_))

    #Saving the fitted model for this n
    KM_dict[n]=km

#Plotting 2d color coded with labels:
#one reference panel plus the first `cluster_graphs` clusterings
fig,axes=plt.subplots(1,len(n_list[:cluster_graphs])+1)
fig.set_size_inches(35,5)

plt.suptitle(f'{dataset_name} dataset clusters using {algorithm_name}',fontsize=18)

for c,ax in enumerate(fig.axes):

    if c>0:
        #Panels 1..cluster_graphs show the clustering for n_list[c-1]
        labels=all_labels[c-1]

        sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels,cmap='plasma')
        ax.set_title('{} : COMPONENTS'.format(n_list[c-1]))

    else:
        #Panel 0 shows the original labels as a reference
        sc=ax.scatter(dataset_2d[:,0],dataset_2d[:,1],c=labels_og,cmap='plasma')
        ax.set_title('Dataset')

    ax.set_ylabel('Y')
    ax.set_xlabel('X')
    ax.legend(*sc.legend_elements(), title='clusters')

plt.tight_layout()

#Plotting WCSS and Silhouette score against the number of clusters, one figure each.
#Each entry is (figure title, scores to plot, y-axis label).
metric_curves=[
    ('WCSS & Clusters',wcss_scores,'WCSS'),
    ('Silhouette_scores & Clusters',silhouette_scores,'Silhouette_scores'),
]

for curve_title,curve_scores,curve_ylabel in metric_curves:
    fig=plt.figure()
    fig.set_size_inches(10,5)

    plt.title(curve_title)
    plt.plot(n_list,curve_scores)
    plt.scatter(n_list, curve_scores, s=100, c='blue', marker='o', edgecolors='black')
    plt.xlabel('N_clusters')
    plt.ylabel(curve_ylabel)

    plt.tight_layout()

In [None]:
#Chosen number of K-Modes clusters for the UMAP-transformed data
best_kmodes_UMAP_d1=15

#Passing the UMAP output through categorizer (categorical_cols=None) before K-Modes
data_categorized=categorizer(dataset_transformed_UMAP,None)
dataset=data_categorized

#Every column index is handed to K-Modes as categorical
all_columns=np.arange(dataset.shape[-1])

#Building and fitting the K-Modes model with the chosen cluster count
km = KModes(n_clusters=best_kmodes_UMAP_d1, init='Huang', n_init=3, verbose=0)
km.fit(dataset,categorical=all_columns)

print("DATASET 1 KMODES RESULTS \n\n")

#Cluster assignments, shared by both scores
cluster_ids=km.predict(dataset,categorical=all_columns)

#Normalized Mutual Info score against LABELS
mi_score=normalized_mutual_info_score(LABELS,cluster_ids)
print(f'The Normalized Mutual Info Score for the best clustering is {mi_score}')

#Adjusted Rand score against LABELS
aj_score=adjusted_rand_score(LABELS,cluster_ids)
print(f'The Adjusted Rand Score for the best clustering is {aj_score}')

In [None]:
#Visualizing the K-Modes clusters next to LABELS on a 2-D t-SNE embedding
tsne_vis=TSNE(n_components=2,perplexity=45,n_jobs=-1)

#Embedding DATASET in 2-D so the points can be plotted
dataset_transformed=tsne_vis.fit_transform(DATASET)

#plt.subplots creates the figure itself; a separate plt.figure() call beforehand
#would only leave an extra empty figure in the output
fig, axes = plt.subplots(1,2,figsize=(25, 5))

plt.suptitle('Visualizations K-MODES Clustering Results with Labels (UMAP FMINST)',fontsize=18)

#Same column-index array that was passed as `categorical` when km was fitted
LABS=[LABELS,km.predict(dataset,categorical=np.arange(dataset.shape[-1]))]

#Mean 2-D position of each class in LABELS, used to anchor one example image per class
position_dict={}
for lab in np.unique(LABELS):
    class_points=dataset_transformed[LABELS==lab]
    position_dict[lab]=[class_points[:,0].mean(),class_points[:,1].mean()]

for ax,labels_,panel_title in zip(axes,LABS,('Actual','Clustered')):

    artists=[]

    #No `label=` here: the legend below is built from sc.legend_elements(),
    #so an array-valued label on the collection served no purpose
    sc=ax.scatter(dataset_transformed[:,0],dataset_transformed[:,1],c=labels_,cmap='viridis',zorder=1)

    #Overlay the first image of each class at that class's mean position
    for lab in np.unique(LABELS):
        image_x,image_y=position_dict[lab]
        im=OffsetImage(X_fm_scaled[LABELS==lab][0].reshape(28,28),cmap='gray')
        ab = AnnotationBbox(im, (image_x, image_y), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))

    ax.set_title(panel_title)
    ax.legend(*sc.legend_elements(), title='Label')
plt.tight_layout()

In [None]:
#Two panels (LABELS vs K-Modes clusters) where every point is drawn with its own image on top.
#NOTE(review): one AnnotationBbox is created per sample, which gets slow on large datasets.
fig, axes = plt.subplots(1, 2, figsize=(25, 5))

plt.suptitle('Visualizations K-MODES Clustering Results with Labels (UMAP FMINST)', fontsize=18)

#Same column-index array that was passed as `categorical` when km was fitted
LABS = [LABELS, km.predict(dataset, categorical=np.arange(dataset.shape[-1]))]

#Mean 2-D position of each class in LABELS. Not read in this cell; kept because the
#other visualisation cells share this name. Indexed with LABELS rather than the
#`labels` variable left over from an earlier experiment cell.
position_dict = {}
for lab in np.unique(LABELS):
    class_points = dataset_transformed[LABELS == lab]
    position_dict[lab] = [class_points[:, 0].mean(), class_points[:, 1].mean()]

for ax, labels_, panel_title in zip(axes, LABS, ('Actual', 'Clustered')):
    artists = []

    #Iterate over the groups of THIS panel: the clustered panel can contain ids
    #that never occur in LABELS, and those would otherwise be left out
    for lab in np.unique(labels_):
        #Select the group's points and images once instead of once per point
        members = labels_ == lab
        points = dataset_transformed[members]
        images = X_fm_scaled[members]

        # Plot points (one colour per group from the default colour cycle)
        ax.scatter(points[:, 0], points[:, 1], label=lab)

        # Overlay images
        for (x, y), image in zip(points, images):
            im = OffsetImage(image.reshape(28, 28), cmap='gray')
            ab = AnnotationBbox(im, (x, y), xycoords='data', frameon=False)
            artists.append(ax.add_artist(ab))

    ax.set_title(panel_title)

    #Per-axes legend built from the labelled scatters above
    ax.legend(title='Label')

plt.tight_layout()
plt.show()