In [1]:
import numpy as np

from utils_sub import u_calculation
from utils import cost_function
from utils import update_Z
from utils import sub_weight_update

In [2]:
import numpy as np
def weighted_distance(s1, s2, weight_vec, beta):
    """Feature-weighted squared distance between two samples.

    Each feature's squared difference is multiplied by that feature's weight
    raised to the power ``beta``; the products are then summed with ``np.dot``.

    Args:
        s1 (ndarray): first sample, one value per feature.
        s2 (ndarray): second sample, same layout as ``s1``.
        weight_vec (ndarray): per-feature weights, size (n_features, ).
        beta (scalar): exponent applied element-wise to ``weight_vec``.

    Returns:
        scalar: the weighted distance between the two samples.
    """
    # Per-feature squared gap between the two samples.
    squared_gaps = np.square(s1 - s2)
    # w ** beta, element-wise.
    powered_weights = np.power(weight_vec, beta)
    return np.dot(squared_gaps, powered_weights.T)

In [3]:
def sub_closest_center(sample, centers, weight_matrix, beta=2):
    """Return the index of the cluster whose center is nearest to ``sample``.

    The distance to each center is measured with ``weighted_distance`` using
    that cluster's own row of ``weight_matrix``.

    Args:
        sample (ndarray): a vector that represents one data point.
        centers (ndarray): cluster centers, iterated row by row; row ``i`` is
            paired with ``weight_matrix[i]``.
        weight_matrix (ndarray): per-cluster feature weights, indexed by cluster.
        beta (scalar): exponent applied to the weights inside ``weighted_distance``.

    Returns:
        int: index of the cluster with the smallest weighted distance.
    """
    # One weighted distance per cluster, each with its cluster-specific weights.
    distances = [
        weighted_distance(sample, center, weight_matrix[cluster_id], beta)
        for cluster_id, center in enumerate(centers)
    ]
    return np.argmin(distances)



In [4]:

def u_calculation(data, centers, weights, beta = 2):
    """Build the hard-assignment matrix U from the data, the centers Z and the weights W.

    Args:
        data (ndarray): samples, one per row.
        centers (ndarray): cluster centers, one per row.
        weights (ndarray): per-cluster feature weights passed to ``sub_closest_center``.
        beta (scalar): exponent applied to the weights. Defaults to 2.

    Returns:
        ndarray: matrix of shape (n_samples, n_clusters) with a single 1 in each
        row, placed in the column of the closest center.
    """
    membership = np.zeros((data.shape[0], centers.shape[0]))
    for row, sample in enumerate(data):
        # Flag the column of the nearest cluster for this sample.
        membership[row, sub_closest_center(sample, centers, weights, beta)] = 1
    return membership

In [23]:
def clusters_vec(U):
    """Collapse the assignment matrix U into a vector of cluster indices.

    Args:
        U (ndarray): assignment matrix, one row per sample.

    Returns:
        ndarray: integer vector of size (n_samples, ); element ``m`` is the
        position of the largest entry in row ``m`` of ``U``.
    """
    row_count = U.shape[0]
    labels = [np.argmax(U[row]) for row in range(row_count)]
    return np.array(labels, dtype=int)

In [24]:
def clusters_vec(U):
    """Return, for every row of U, the index of its largest entry.

    NOTE(review): this cell defines ``clusters_vec`` a second time with the same
    behaviour as the preceding cell; the later definition is the one in effect
    after both cells run, and one of the two can be dropped.

    Args:
        U (ndarray): assignment matrix, one row per sample.

    Returns:
        ndarray: integer vector of size (n_samples, ) with the cluster index of
        each sample.
    """
    assignments = np.zeros(U.shape[0])
    for position, membership_row in enumerate(U):
        assignments[position] = np.argmax(membership_row)
    return assignments.astype(int)

In [46]:
def cost_function(U, Z, W_matrix, X, beta = 2):
    """Sum the weighted distance of every record to the center of its own cluster.

    Args:
        U (ndarray): assignment matrix; the cluster of record ``m`` is read from
            row ``m`` through ``clusters_vec``.
        Z (ndarray): cluster centers, indexed by cluster.
        W_matrix (ndarray): feature weights, indexed by cluster (row ``c`` holds
            the weights used for cluster ``c``).
        X (ndarray): matrix of records, indexed by record.
        beta (int, optional): power applied to the weights. Defaults to 2.

    Returns:
        scalar: the accumulated cost (stays at the initial 0 when there are no records).
    """
    labels = clusters_vec(U)

    total = 0
    for record in range(len(labels)):
        cluster = labels[record]
        # Add this record's contribution, then unwrap to a plain Python scalar.
        total = (total + weighted_distance(X[record], Z[cluster], W_matrix[cluster], beta)).item()
    return total


In [26]:

def clusters_dict(U):
    """Group sample indices by the cluster they are assigned to.

    Args:
        U (ndarray): assignment matrix with one column per cluster.

    Returns:
        dict: maps every cluster index in ``range(U.shape[1])`` to an array with
        the indices of the samples assigned to it (empty if it has none).
    """
    column_count = U.shape[1]
    labels = clusters_vec(U)
    return {
        cluster_id: np.where(labels == cluster_id)[0]
        for cluster_id in range(column_count)
    }

In [27]:

def update_Z(U, Z, X):
    """Update Z, i.e. the centers of clusters, by taking the mean of the samples in each cluster.

    Args:
        U (ndarray): assignment matrix; cluster membership is derived from it
            with ``clusters_dict``.
        Z (ndarray): current cluster centers, one row per cluster.
        X (ndarray): matrix of records, one row per sample.

    Returns:
        ndarray: new centers with the same shape as ``Z``. A cluster that has no
        samples keeps its previous center instead of becoming NaN.
    """
    cluster_dict = clusters_dict(U)

    Z = np.asarray(Z)
    # Means are fractional in general: never store them in an integer buffer,
    # which would silently truncate them.
    out_dtype = Z.dtype if np.issubdtype(Z.dtype, np.inexact) else float
    # Start from a copy of the old centers so empty clusters are left untouched.
    new_Z = Z.astype(out_dtype, copy=True)

    for i, ind in cluster_dict.items():
        if len(ind) == 0:
            # np.mean over an empty selection would yield NaN (with a RuntimeWarning).
            continue
        new_Z[i] = np.mean(X[ind], axis=0)

    return new_Z

In [28]:



def dj(X, U, Z):
    """Compute the dispersion Dj of every feature, summed over all clusters.

    For feature ``j`` the dispersion is the sum, over every cluster ``l`` and
    every sample assigned to ``l``, of the squared difference between the
    sample's value of feature ``j`` and ``Z[l][j]``.

    Args:
        X (ndarray): matrix of records (n_records, n_features).
        U (ndarray): assignment matrix with one column per cluster.
        Z (ndarray): cluster centers, one row per cluster.

    Returns:
        list: one dispersion value per feature.
    """
    cluster_dict = clusters_dict(U)
    n_features = X.shape[1]
    n_clusters = U.shape[1]

    D = np.zeros(n_features)
    for l in range(n_clusters):
        # Rows of X that belong to cluster l; shape (n_members, n_features).
        members = X[cluster_dict[l]]
        # Sum over the samples (axis 0) so that each *feature column* gets its
        # own total. Indexing the member rows with [j] would pick sample j
        # instead of feature j.
        D += np.sum(np.square(members - Z[l]), axis=0)

    return list(D)

In [29]:

def weight_update(X, U, Z, weights, beta=2):
    """Recompute the feature weights from the per-feature dispersions.

    Features whose dispersion Dj is zero get weight 0. Every other feature j
    gets ``1 / sum_t (Dj / Dt) ** (1 / (beta - 1))`` where ``t`` runs over the
    features with non-zero dispersion.

    Args:
        X (ndarray): matrix of records (n_records, n_features).
        U (ndarray): assignment matrix with one column per cluster.
        Z (ndarray): cluster centers, one row per cluster.
        weights (ndarray): current weights; only used as the template for the
            shape of the result, which is indexed by feature.
        beta (scalar): exponent parameter; must differ from 1 because the
            formula divides by ``beta - 1``. Defaults to 2.

    Returns:
        ndarray: updated weights, shaped like ``weights`` (all zeros when every
        dispersion is zero).
    """
    # dj returns a plain list; convert it so comparisons and indexing are
    # element-wise (a list compared with 0 is just the scalar False).
    D = np.asarray(dj(X, U, Z), dtype=float)

    # Zero-dispersion features keep the weight 0 they are initialised with.
    weights_upd = np.zeros_like(weights, dtype=float)

    ind_D_not_zero = np.nonzero(D)[0]  # indexes of non-zero Dj
    if ind_D_not_zero.size:
        exponent = 1 / (beta - 1)
        D_nz = D[ind_D_not_zero]
        # ratios[a, b] = (D_nz[a] / D_nz[b]) ** exponent
        ratios = (D_nz[:, None] / D_nz[None, :]) ** exponent
        weights_upd[ind_D_not_zero] = 1 / ratios.sum(axis=1)

    return weights_upd

In [51]:

#################################
# for subspace weighted k_means:


def sub_dj(X, U, Z):
    """Calculate D (dispersion) for each feature in each subspace or cluster.

    Args:
        X (ndarray): matrix of records (n_records, n_features)
        U (ndarray): U is an (M, k) matrix, ui,l is a binary variable, and ui,l = 1 indicates that record i is allocated to cluster l.
        Z (ndarray): is a set of k vectors representing the k-cluster centers of size (n_clusters, n_features)

    Returns:
        D (ndarray): a matrix of size (n_clusters, n_features), where element [l, j]
        is the sum of squared differences between feature j of the records in
        cluster l and Z[l][j] (0 for a cluster without records).
    """
    cluster_dict = clusters_dict(U)
    n_features = X.shape[1]
    n_clusters = U.shape[1]

    # Each row is for one cluster and each column is for one feature.
    D = np.empty((n_clusters, n_features))
    for l in range(n_clusters):
        # Rows of X that belong to cluster l; shape (n_members, n_features).
        members = X[cluster_dict[l]]
        # Reduce over the samples (axis 0) to get one value per feature column.
        # Indexing the member rows with [j] would pick sample j instead of
        # feature j.
        D[l] = np.sum(np.square(members - Z[l]), axis=0)

    return D

In [54]:
def sub_weight_update(X, U, Z, weights, beta=2):
    """Recompute the per-cluster feature weights from the per-cluster dispersions.

    For cluster ``l`` and feature ``j`` with non-zero dispersion the new weight
    is ``1 / sum_t (D[l, j] / D[l, t]) ** (1 / (beta - 1))`` where ``t`` runs over
    the features of cluster ``l`` with non-zero dispersion. Features with zero
    dispersion get weight 0, matching ``weight_update``. If every dispersion of
    a cluster is zero, that cluster keeps its previous weights.

    Args:
        X (ndarray): matrix of records (n_records, n_features).
        U (ndarray): assignment matrix with one column per cluster.
        Z (ndarray): cluster centers, one row per cluster.
        weights (ndarray): current weights, a matrix of size (n_clusters, n_features).
        beta (scalar): exponent parameter; must differ from 1 because the
            formula divides by ``beta - 1``. Defaults to 2.

    Returns:
        ndarray: updated weights of size (n_clusters, n_features).
    """
    n_clusters = weights.shape[0]

    # D calculation: D[l, j] is the dispersion of feature j inside cluster l.
    D = np.asarray(sub_dj(X, U, Z), dtype=float)

    exponent = 1 / (beta - 1)

    # Start from a float copy so that degenerate clusters keep their old weights.
    weights_upd = np.array(weights, dtype=float)

    for l in range(n_clusters):
        nonzero = np.nonzero(D[l])[0]
        if nonzero.size == 0:
            # No usable dispersion for this cluster: leave its weights as they were.
            continue

        # Dividing by a zero dispersion would produce inf/NaN, so zero-dispersion
        # features are excluded from the ratios and get weight 0.
        weights_upd[l] = 0.0
        d_row = D[l, nonzero]
        # ratios[a, b] = (d_row[a] / d_row[b]) ** exponent
        ratios = (d_row[:, None] / d_row[None, :]) ** exponent
        weights_upd[l, nonzero] = 1 / ratios.sum(axis=1)

    return weights_upd

### testing

In [47]:
def sub_w_k_means(X, k, beta=2, tol=0.0001, random_state=None):
    """Put data (X) in k clusters based on per-cluster feature weights.

    The weights are calculated from X and the user defined parameter beta.
    After a random initialisation the function alternates three steps -- update
    the assignments U, update the centers Z, update the weights W -- and records
    the cost after every step. It stops as soon as the last two recorded costs
    differ by no more than ``tol``.

    Args:
        X (ndarray): input data (without label), one sample per row.
        k (int): the number of clusters.
        beta (float): user defined parameter that is used in the definition of
            the loss function; it is passed to every assignment, cost and
            weight computation.
        tol (float): convergence threshold on the absolute difference between
            the last two recorded costs. Defaults to 0.0001.
        random_state (int, optional): seed for the random initialisation. When
            None (the default) NumPy's global random state is used.

    Returns:
        dict: history with the keys 'U', 'Z', 'W' and 'cost'; each value is the
        list of every value that quantity took, in order, the last element
        being the final one.
    """
    n_samples = X.shape[0]  # Number of samples
    n_features = X.shape[1]  # Number of features
    n_clusters = k  # Number of clusters

    # Without a seed keep using the global generator, so callers that rely on
    # np.random.seed(...) get the same reproducibility as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Dictionary of history
    history = {
        'U': [],
        'Z': [],
        'W': [],
        'cost': []
        }

    def _still_improving():
        # True while the last step changed the cost by more than the tolerance.
        costs = history['cost']
        return np.abs(costs[-1] - costs[-2]) > tol

    # Initial centers are samples picked at random from the dataset. Sampling
    # without replacement avoids starting with two identical centers; it is only
    # relaxed when more clusters than samples are requested.
    Z_initial_index = rng.choice(n_samples, size=n_clusters, replace=n_clusters > n_samples)
    Z = X[Z_initial_index, :]
    history['Z'].append(Z)

    # Random weights: each cluster's row sums up to 1.
    weights = rng.dirichlet(np.ones(n_features), size=n_clusters)
    history['W'].append(weights)

    # Initial assignment and its cost.
    U = u_calculation(X, Z, weights, beta)
    history['U'].append(U)
    history['cost'].append(cost_function(U, Z, weights, X, beta))

    # First center update.
    Z = update_Z(U, Z, X)
    history['Z'].append(Z)
    history['cost'].append(cost_function(U, Z, weights, X, beta))

    # First weight update, computed from (and evaluated against) the updated centers.
    weights = sub_weight_update(X, U, Z, weights, beta)
    history['W'].append(weights)
    history['cost'].append(cost_function(U, Z, weights, X, beta))

    # Alternate the three partial updates until one of them no longer moves the cost.
    while True:
        # P1 --> update U
        if not _still_improving():
            break
        U = u_calculation(X, Z, weights, beta)
        history['U'].append(U)
        history['cost'].append(cost_function(U, Z, weights, X, beta))

        # P2 --> update Z
        if not _still_improving():
            break
        Z = update_Z(U, Z, X)
        history['Z'].append(Z)
        history['cost'].append(cost_function(U, Z, weights, X, beta))

        # P3 --> update weights
        if not _still_improving():
            break
        weights = sub_weight_update(X, U, Z, weights, beta)
        history['W'].append(weights)
        history['cost'].append(cost_function(U, Z, weights, X, beta))

    return history

# if __name__ == "__main__":

#     # producing clustering dataset
#     from sklearn.datasets import make_blobs

#     n_samples = 200
#     n_features = 6
#     n_clusters = 3
#     data, y = make_blobs(n_samples=n_samples, n_features=n_features , centers=n_clusters, random_state=42)

#     hist = sub_w_k_means(data, n_clusters, beta=2)
#     print(hist['W'][-1])
#     print(np.sum(hist['W'][-1]))

In [57]:
# NOTE(review): `data` and `n_clusters` are not defined in any cell visible
# here (the make_blobs setup above is commented out) -- confirm they exist
# before running this cell on a fresh kernel.
hist = sub_w_k_means(data, n_clusters, beta=2)
# Final weight matrix, followed by the sum of each of its rows.
print(hist['W'][-1])
print(hist['W'][-1].sum(axis=1))

[[0.25999461 0.09041776 0.18065545 0.23687374 0.12249849 0.10955995]
 [0.09315055 0.17037752 0.30061339 0.21207378 0.09579951 0.12798525]
 [0.05832977 0.18638139 0.15280761 0.16155574 0.27562388 0.16530161]]
[1. 1. 1.]
