# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer, InputSpec, Dense, Input
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras import backend as K
from sklearn.preprocessing import StandardScaler


# Load the saved autoencoder and DEC models from their HDF5 files (using custom_objects for the clustering layer)

In [None]:
class ClusteringLayer(Layer):
    """Soft cluster-assignment layer.

    For an input batch of shape ``(batch, input_dim)`` the layer returns a
    ``(batch, n_clusters)`` tensor ``q`` whose rows sum to 1. Each entry is

        q_ij  ∝  (1 + ||z_i - mu_j||^2 / alpha) ** (-(alpha + 1) / 2)

    where ``mu_j`` is row ``j`` of the trainable ``clusters`` weight.

    Args:
        n_clusters: Number of cluster centres (rows of the ``clusters`` weight).
        weights: Optional list of initial weight arrays passed to
            ``set_weights`` once the layer is built.
        alpha: Divisor/exponent parameter of the kernel above. Defaults to 1.0.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        """Create the ``(n_clusters, input_dim)`` weight and apply any initial weights."""
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype='float32', shape=(None, input_dim))
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform',
                                        name='clusters')
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            # The initial values are only needed once; release the reference.
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        """Return the row-normalised soft assignments for ``inputs``."""
        # (batch, 1, dim) - (n_clusters, dim) broadcasts to (batch, n_clusters, dim);
        # summing the squares over the last axis gives squared distances to every centre.
        squared_distances = K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2)
        q = 1.0 / (1.0 + (squared_distances / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        # Divide every row by its own sum (the transposes line the row sums up for broadcasting).
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        return q

    def compute_output_shape(self, input_shape):
        """Map ``(batch, input_dim)`` to ``(batch, n_clusters)``."""
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created.

        Without this override the layer's ``n_clusters``/``alpha`` are not part of the
        serialized config, which breaks saving a model that contains the layer.
        """
        config = super(ClusteringLayer, self).get_config()
        config.update({'n_clusters': self.n_clusters, 'alpha': self.alpha})
        return config

# Restore both saved Keras models from their HDF5 files.
# `custom_objects` maps the name 'ClusteringLayer' to the class defined above so that
# load_model can resolve it when reading dec_model.h5.
dec_custom_objects = {'ClusteringLayer': ClusteringLayer}

loaded_autoencoder = load_model("autoencoder_model_dec.h5")
loaded_dec_model = load_model("dec_model.h5", custom_objects=dec_custom_objects)


# Load the dataset and label the clusters

In [8]:
# Location of the preprocessed Yelp user table.
# NOTE(review): machine-specific absolute path - point this at your own copy of the data.
USER_CSV_PATH = "C:\\Users\\kasun\\Downloads\\yelp\\data\\yelp_academic_dataset_user_preprocessed.csv"

# Columns removed before scaling: identifiers/bookkeeping plus every compliment_* column.
NON_FEATURE_COLUMNS = [
    'user_id', 'num', 'name',
    'compliment_hot', 'compliment_more', 'compliment_profile', 'compliment_cute',
    'compliment_list', 'compliment_note', 'compliment_plain', 'compliment_cool',
    'compliment_funny', 'compliment_writer', 'compliment_photos',
]

# Columns that receive components 0, 1 and 2 of the model's second output.
REPRESENTATION_COLUMNS = ['representation_1', 'representation_2', 'representation_3']

user_df = pd.read_csv(USER_CSV_PATH)

# Standalone handle on the ids. `user_df` keeps its own 'user_id' column throughout
# (the drop below returns a new frame), so nothing has to be added back afterwards.
user_ids = user_df['user_id']

numeric_features = user_df.drop(NON_FEATURE_COLUMNS, axis=1)

# NOTE(review): the scaler is fitted here on this data rather than loaded from disk;
# presumably this matches the scaling used when the model was trained - confirm.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_features)

# The DEC model returns two outputs: soft cluster assignments `q` and a representation.
q, original_representations = loaded_dec_model.predict(scaled_data)
original_cluster_assignments = np.argmax(q, axis=1)

# Fail with a clear message instead of an IndexError if the representation is too narrow.
n_components = len(REPRESENTATION_COLUMNS)
if original_representations.ndim != 2 or original_representations.shape[1] < n_components:
    raise ValueError(
        "Expected a 2-D representation with at least %d columns, got shape %s"
        % (n_components, original_representations.shape)
    )

# Add the hard cluster label and the first three representation components per user.
user_df['cluster_dec'] = original_cluster_assignments
for component_index, column_name in enumerate(REPRESENTATION_COLUMNS):
    user_df[column_name] = original_representations[:, component_index]

# One list of three floats per user, stored in a single column.
representations_as_arrays = user_df[REPRESENTATION_COLUMNS].to_numpy()
user_df['representations_as_vectors_dec'] = representations_as_arrays.tolist()

# Number of users assigned to each cluster.
original_cluster_counts = user_df.groupby('cluster_dec').size().reset_index(name='count')
print(original_cluster_counts)


   cluster_dec   count
0            0  467603
1            1     300
2            2     253
3            3     343
4            4    3325
5            5  764632
6            6   22333
7            7   61927
8            8  667181


In [9]:
# Preview the first five rows, now including the cluster label and representation columns.
user_df.head(n=5)

Unnamed: 0,num,user_id,name,review_count,useful,funny,cool,fans,average_stars,compliment_hot,...,compliment_writer,compliment_photos,elite_years,is_elite,years_on_platform,cluster_dec,representation_1,representation_2,representation_3,representations_as_vectors_dec
0,0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,7217,1259,5994,267,3.91,250,...,239,180,1,1,16,4,0.163887,0.165749,0.164215,"[0.16388708353042603, 0.16574923694133759, 0.1..."
1,1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,43091,13066,27281,3138,3.74,1145,...,1521,1946,14,1,14,4,0.166433,0.16633,0.166578,"[0.1664333939552307, 0.16633014380931854, 0.16..."
2,2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2086,1010,1003,52,3.32,89,...,35,18,5,1,15,0,0.163177,0.172831,0.163703,"[0.16317729651927948, 0.17283140122890472, 0.1..."
3,3,SZDeASXq7o05mMNLshsdIA,Gwen,224,512,330,299,28,4.27,24,...,10,9,3,1,18,7,0.15831,0.180835,0.160942,"[0.1583097130060196, 0.18083520233631134, 0.16..."
4,4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,29,15,7,1,3.54,1,...,0,0,0,0,16,8,0.167469,0.168774,0.167819,"[0.16746939718723297, 0.16877354681491852, 0.1..."


In [10]:
# Remove `num` and the three per-component columns; the same component values remain
# available in 'representations_as_vectors_dec'.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run: a second execution no longer raises KeyError for already-removed columns.
user_df = user_df.drop(
    columns=['num', 'representation_1', 'representation_2', 'representation_3'],
    errors='ignore',
)

In [11]:
# Preview the first five rows after the helper columns were removed.
user_df.head(n=5)

Unnamed: 0,user_id,name,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,...,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,elite_years,is_elite,years_on_platform,cluster_dec,representations_as_vectors_dec
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,7217,1259,5994,267,3.91,250,65,...,844,467,467,239,180,1,1,16,4,"[0.16388708353042603, 0.16574923694133759, 0.1..."
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,43091,13066,27281,3138,3.74,1145,264,...,7054,3131,3131,1521,1946,14,1,14,4,"[0.1664333939552307, 0.16633014380931854, 0.16..."
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2086,1010,1003,52,3.32,89,13,...,96,119,119,35,18,5,1,15,0,"[0.16317729651927948, 0.17283140122890472, 0.1..."
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,512,330,299,28,4.27,24,4,...,16,26,26,10,9,3,1,18,7,"[0.1583097130060196, 0.18083520233631134, 0.16..."
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,29,15,7,1,3.54,1,1,...,1,0,0,0,0,0,0,16,8,"[0.16746939718723297, 0.16877354681491852, 0.1..."


In [12]:
# Write the labelled user table to disk; the DataFrame index is not written.
user_df.to_csv(path_or_buf="user_with_clusters.csv", index=False)