In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.metrics.pairwise import euclidean_distances

In [2]:
clean_df = pd.read_csv('data/clean_data.csv')
clean_df

Unnamed: 0,Names,Genders,City,17-25,26-34,35-43,44-52,52+,Math Tutor,Science Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Novia Rizki Wulandari,1,Bandung,1,0,0,0,0,1,0,...,5,4,4,5,5,4,4,2,4,4
1,Nabhan Nabilah,1,Bandung,1,0,0,0,0,0,0,...,5,2,5,5,5,2,2,5,5,2
2,Herlina Kusyanuri Putri,1,Bandung,1,0,0,0,0,0,0,...,5,5,5,5,5,4,3,3,4,3
3,Rifdah Alyaa,1,Makassar,1,0,0,0,0,1,0,...,2,3,2,3,4,3,2,3,4,3
4,Salsabilla,1,Bandung,1,0,0,0,0,0,1,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Usamah,0,Bandung,1,0,0,0,0,0,0,...,5,5,4,5,5,4,4,5,5,5
97,Helena,1,Bandung,0,0,0,0,0,1,0,...,4,3,4,4,4,3,4,2,5,4
98,Deva,1,Bandung,0,1,0,0,0,0,0,...,4,4,4,3,4,3,4,2,4,3
99,Dzaky,0,Bandung,1,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,4,5


In [3]:
# Check that we not missing any value after combine columns
print('Is there any missing value? ', clean_df.isnull().values.any())
print('How many missing values? ', clean_df.isnull().values.sum())

Is there any missing value?  False
How many missing values?  0


In [4]:
# Copy origin user data into match data to use in cluster
match_df = clean_df.copy()

In [5]:
# Drop Column that not use for clustering match
match_df.drop(['Names', 'City'], axis=1, inplace=True)
match_df

Unnamed: 0,Genders,17-25,26-34,35-43,44-52,52+,Math Tutor,Science Tutor,Social Tutor,Technology Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,1,1,0,0,0,0,1,0,0,0,...,5,4,4,5,5,4,4,2,4,4
1,1,1,0,0,0,0,0,0,1,0,...,5,2,5,5,5,2,2,5,5,2
2,1,1,0,0,0,0,0,0,1,0,...,5,5,5,5,5,4,3,3,4,3
3,1,1,0,0,0,0,1,0,0,0,...,2,3,2,3,4,3,2,3,4,3
4,1,1,0,0,0,0,0,1,0,0,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,1,0,0,0,0,0,0,0,0,...,5,5,4,5,5,4,4,5,5,5
97,1,0,0,0,0,0,1,0,0,0,...,4,3,4,4,4,3,4,2,5,4
98,1,0,1,0,0,0,0,0,0,0,...,4,4,4,3,4,3,4,2,4,3
99,0,1,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,4,5


In [6]:
match_array = match_df.values
match_array.shape

(101, 39)

In [10]:
class PCAModel(tf.Module):
    def __init__(self, x_mean, components):
        super(PCAModel, self).__init__()
        self.x_mean = tf.Variable(x_mean, trainable=False)
        self.components = tf.Variable(components, trainable=False)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def scale_features(self, x):
        x_min = tf.reduce_min(x, axis=0)
        x_max = tf.reduce_max(x, axis=0)

        # Handle columns with zero variance
        zero_variance_mask = tf.math.equal(x_min, x_max)
        non_zero_variance_mask = tf.math.logical_not(zero_variance_mask)

        # Scale the columns with non-zero variance
        x_scaled_non_zero = tf.where(non_zero_variance_mask, (x - x_min) / (x_max - x_min), x)

        # Scale the columns with zero variance
        x_scaled_zero = tf.where(zero_variance_mask, tf.zeros_like(x), x_scaled_non_zero)

        return x_scaled_zero

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def apply_pca(self, x):
        x_scaled = self.scale_features(x)
        x_centered = x_scaled - self.x_mean
        
        # Compute the covariance matrix
        covariance_matrix = tf.matmul(tf.transpose(x_centered), x_centered) / tf.cast(tf.shape(x_centered)[0], dtype=tf.float32)

        # Perform eigenvalue decomposition
        eigenvalues, eigenvectors = tf.linalg.eigh(covariance_matrix)
        
        # Sort eigenvectors based on eigenvalues
        sorted_indices = tf.argsort(eigenvalues, direction='DESCENDING')
        sorted_eigenvectors = tf.gather(eigenvectors, sorted_indices, axis=1)
        
        # Select the top k eigenvectors
        k = tf.minimum(tf.shape(sorted_eigenvectors)[1], 2)  # Choose the top 2 eigenvectors (modify as needed)
        selected_eigenvectors = sorted_eigenvectors[:, :k]
        
        # Project the centered data onto the selected eigenvectors
        x_pca = tf.matmul(x_centered, selected_eigenvectors)
        
        return x_pca

# Convert the numpy array to a TensorFlow tensor
x_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Perform PCA with k=2 (reduce to 2 dimensions)
x_mean = tf.reduce_mean(x_tf, axis=0)
_, _, V = tf.linalg.svd(x_tf - x_mean)
components = V[:, :2]

# Create an instance of the PCA model
pca_model = PCAModel(x_mean, components)

In [11]:
# Save the PCAModel as a SavedModel
tf.saved_model.save(
    pca_model,
    export_dir='pca_model/1/',
    signatures={
        'serving_default': pca_model.apply_pca.get_concrete_function()
    }
)

INFO:tensorflow:Assets written to: pca_model/1/assets


In [12]:

!saved_model_cli show --dir {'pca_model/1/'} --all


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['x'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 39)
        name: serving_default_x:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['output_0'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 2)
        name: StatefulPartitionedCall:0
  Method name is: tensorflow/serving/predict
The MetaGraph with tag set ['serve'] contains the following ops: {'MatMul', 'StatefulPartitionedCall', 'ReadVariableOp', 'Placeholder', 'Transpose', '

## Test Load Model Directly

In [13]:
# Load the PCA model
loaded_model = tf.saved_model.load("pca_model/1/")

In [14]:
# Generate a new data point
new_data = np.random.rand(1, 39)
new_data_tf = tf.convert_to_tensor(new_data, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor([[-1.733514e+01 -9.536743e-07]], shape=(1, 2), dtype=float32)


In [15]:
# Predict PCA Valuse Using Our Match/Users Datasets
new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor(
[[-1.39437504e+01  4.54202533e-01]
 [-1.45697994e+01 -5.31194091e-01]
 [-1.41648922e+01 -3.12957972e-01]
 [-1.51569881e+01 -2.45807320e-01]
 [-1.43135462e+01 -7.82330275e-01]
 [-1.46237698e+01 -4.98418957e-01]
 [-1.46206722e+01  1.45700562e+00]
 [-1.42957802e+01 -2.23677963e-01]
 [-1.48251152e+01  6.07353926e-01]
 [-1.41055727e+01  1.20196462e+00]
 [-1.50505314e+01 -6.47276640e-03]
 [-1.37643089e+01 -1.87686563e-01]
 [-1.47877083e+01  2.91811496e-01]
 [-1.40905161e+01  2.97631443e-01]
 [-1.43425884e+01 -2.47002542e-02]
 [-1.39248114e+01 -1.95534170e-01]
 [-1.43347607e+01  1.06691331e-01]
 [-1.50019331e+01 -4.76971686e-01]
 [-1.48622246e+01  3.41470510e-01]
 [-1.44162731e+01 -4.39514726e-01]
 [-1.33086185e+01  1.02853835e+00]
 [-1.48472261e+01 -2.71165073e-02]
 [-1.46116352e+01  1.82538420e-01]
 [-1.43094902e+01  6.43654943e-01]
 [-1.46728554e+01 -5.09450436e-01]
 [-1.50562620e+01 -4.59727913e-01]
 [-1.44425840e+01  3.58303994e-01]
 [-1.49809952e+01 -7.25632906e-01]
 [-1.4416