In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys

from sklearn.metrics.pairwise import euclidean_distances

In [2]:
clean_df = pd.read_csv('data/clean_data.csv')
clean_df

Unnamed: 0,Names,Genders,City,17-25,26-34,35-43,44-52,52+,Math Tutor,Science Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Novia Rizki Wulandari,1,Bandung,1,0,0,0,0,0,0,...,5,4,4,5,5,4,4,2,4,4
1,Nabhan Nabilah,1,Bandung,1,0,0,0,0,0,0,...,5,2,5,5,5,2,2,5,5,2
2,Herlina Kusyanuri Putri,1,Bandung,1,0,0,0,0,0,0,...,5,5,5,5,5,4,3,3,4,3
3,Rifdah Alyaa,1,Makassar,1,0,0,0,0,1,0,...,2,3,2,3,4,3,2,3,4,3
4,Salsabilla,1,Bandung,1,0,0,0,0,0,1,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Usamah,0,Bandung,1,0,0,0,0,0,1,...,5,5,4,5,5,4,4,5,5,5
97,Helena,1,Bandung,0,0,0,0,0,0,0,...,4,3,4,4,4,3,4,2,5,4
98,Deva,1,Bandung,0,1,0,0,0,0,0,...,4,4,4,3,4,3,4,2,4,3
99,Dzaky,0,Bandung,1,0,0,0,0,1,0,...,5,5,5,5,5,5,5,5,4,5


In [3]:
# Check that we not missing any value after combine columns
print('Is there any missing value? ', clean_df.isnull().values.any())
print('How many missing values? ', clean_df.isnull().values.sum())

Is there any missing value?  False
How many missing values?  0


In [4]:
# Copy origin user data into match data to use in cluster
match_df = clean_df.copy()

In [5]:
# Drop Column that not use for clustering match
match_df.drop(['Names', 'City'], axis=1, inplace=True)
match_df

Unnamed: 0,Genders,17-25,26-34,35-43,44-52,52+,Math Tutor,Science Tutor,Social Tutor,Technology Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,1,1,0,0,0,0,0,0,0,0,...,5,4,4,5,5,4,4,2,4,4
1,1,1,0,0,0,0,0,0,0,1,...,5,2,5,5,5,2,2,5,5,2
2,1,1,0,0,0,0,0,0,0,1,...,5,5,5,5,5,4,3,3,4,3
3,1,1,0,0,0,0,1,0,0,0,...,2,3,2,3,4,3,2,3,4,3
4,1,1,0,0,0,0,0,1,0,0,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,1,0,0,0,0,0,1,0,0,...,5,5,4,5,5,4,4,5,5,5
97,1,0,0,0,0,0,0,0,0,1,...,4,3,4,4,4,3,4,2,5,4
98,1,0,1,0,0,0,0,0,0,0,...,4,4,4,3,4,3,4,2,4,3
99,0,1,0,0,0,0,1,0,0,0,...,5,5,5,5,5,5,5,5,4,5


In [6]:
match_array = match_df.values
match_array.shape

(101, 39)

In [7]:
class PCAModel(tf.Module):
    def __init__(self, x_mean, components):
        super(PCAModel, self).__init__()
        self.x_mean = tf.Variable(x_mean, trainable=False)
        self.components = tf.Variable(components, trainable=False)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def scale_features(self, x):
        x_min = tf.reduce_min(x, axis=0)
        x_max = tf.reduce_max(x, axis=0)

        # Handle columns with zero variance
        zero_variance_mask = tf.math.equal(x_min, x_max)
        non_zero_variance_mask = tf.math.logical_not(zero_variance_mask)

        # Scale the columns with non-zero variance
        x_scaled_non_zero = tf.where(non_zero_variance_mask, (x - x_min) / (x_max - x_min), x)

        # Scale the columns with zero variance
        x_scaled_zero = tf.where(zero_variance_mask, tf.zeros_like(x), x_scaled_non_zero)

        return x_scaled_zero

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 39), dtype=tf.float32)])
    def apply_pca(self, x):
        x_scaled = self.scale_features(x)
        x_centered = x_scaled - self.x_mean
        
        # Compute the covariance matrix
        covariance_matrix = tf.matmul(tf.transpose(x_centered), x_centered) / tf.cast(tf.shape(x_centered)[0], dtype=tf.float32)

        # Perform eigenvalue decomposition
        eigenvalues, eigenvectors = tf.linalg.eigh(covariance_matrix)
        
        # Sort eigenvectors based on eigenvalues
        sorted_indices = tf.argsort(eigenvalues, direction='DESCENDING')
        sorted_eigenvectors = tf.gather(eigenvectors, sorted_indices, axis=1)
        
        # Select the top k eigenvectors
        k = tf.minimum(tf.shape(sorted_eigenvectors)[1], 2)  # Choose the top 2 eigenvectors (modify as needed)
        selected_eigenvectors = sorted_eigenvectors[:, :k]
        
        # Project the centered data onto the selected eigenvectors
        x_pca = tf.matmul(x_centered, selected_eigenvectors)
        
        return x_pca

# Convert the numpy array to a TensorFlow tensor
x_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Perform PCA with k=2 (reduce to 2 dimensions)
x_mean = tf.reduce_mean(x_tf, axis=0)
_, _, V = tf.linalg.svd(x_tf - x_mean)
components = V[:, :2]

# Create an instance of the PCA model
pca_model = PCAModel(x_mean, components)

In [8]:
# Save the PCAModel as a SavedModel
tf.saved_model.save(
    pca_model,
    export_dir='pca_model/1/',
    signatures={
        'serving_default': pca_model.apply_pca.get_concrete_function()
    }
)

INFO:tensorflow:Assets written to: pca_model/1/assets


In [9]:

!saved_model_cli show --dir {'pca_model/1/'} --all


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['x'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 39)
        name: serving_default_x:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['output_0'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 2)
        name: StatefulPartitionedCall:0
  Method name is: tensorflow/serving/predict
The MetaGraph with tag set ['serve'] contains the following ops: {'Const', 'Shape', 'AssignVariableOp', 'Max', 'StringJoin', 'TopKV2', 'Identity', 'Sh

## Test Load Model Directly

In [10]:
# Load the PCA model
loaded_model = tf.saved_model.load("pca_model/1/")

In [11]:
# Generate a new data point
new_data = np.random.rand(1, 39)
new_data_tf = tf.convert_to_tensor(new_data, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor([[-1.7335190e+01  2.6524067e-06]], shape=(1, 2), dtype=float32)


In [12]:
# Predict PCA Valuse Using Our Match/Users Datasets
new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor(
[[-1.39438353e+01  4.21419501e-01]
 [-1.45698500e+01 -4.47724134e-01]
 [-1.41649437e+01 -2.44954735e-01]
 [-1.51571522e+01 -3.82503569e-02]
 [-1.43133917e+01 -6.08577013e-01]
 [-1.46236153e+01 -2.65332133e-01]
 [-1.46207762e+01  1.42164552e+00]
 [-1.42956161e+01  4.31449115e-02]
 [-1.48252583e+01  8.18783879e-01]
 [-1.41057148e+01  1.43509495e+00]
 [-1.50506086e+01 -1.12853646e-02]
 [-1.37643299e+01 -1.34255558e-01]
 [-1.47876644e+01  4.31703418e-01]
 [-1.40904360e+01  8.56026411e-02]
 [-1.43425884e+01 -2.86460727e-01]
 [-1.39249535e+01 -7.24658370e-02]
 [-1.43348408e+01  5.68398833e-03]
 [-1.50018883e+01 -4.42143857e-01]
 [-1.48620548e+01  3.70149344e-01]
 [-1.44161930e+01 -6.55394912e-01]
 [-1.33086843e+01  1.05688298e+00]
 [-1.48471441e+01 -2.07924277e-01]
 [-1.46115150e+01  3.00621092e-02]
 [-1.43097115e+01  4.10153985e-01]
 [-1.46728382e+01 -7.12159991e-01]
 [-1.50560980e+01 -1.21056169e-01]
 [-1.44426155e+01  3.66256624e-01]
 [-1.49809132e+01 -8.32653761e-01]
 [-1.4416

## Build Matchmaking Feature

In [13]:
def predict_pca(match_array):
    # Load the PCA model
    loaded_model = tf.saved_model.load('pca_model/1/')
    new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

    # Predict the PCA value using the loaded model
    pca_value = loaded_model.apply_pca(new_data_tf)
    pca_value_arr = pca_value.numpy()
    
    return pca_value_arr

In [14]:
def get_guides_idx_filtered(user_id, user_df):
    # Get user destination and persona
    user_destination = user_df.loc[user_id, 'City']

    # Filter users with the same destination and roles value of 'guide'
    filtered_users = user_df[(user_df['City'] == user_destination)]

    user_indices = []
    for idx in filtered_users.index:
        user_indices.append(idx)
            
    return user_indices

In [15]:
def matchmaking(user_id, match_array, user_df):
    
    filtered_user_indices = get_guides_idx_filtered(user_id, user_df)
    if len(filtered_user_indices) == 0:
        print("No Results")
        sys.exit()
        
    user_indices = filtered_user_indices + [user_id]
    
    pca_data = predict_pca(match_array[user_indices])

    distances = []
    for idx in user_indices:
        # Get the distances within the cluster of the specified user

        distance_idx = euclidean_distances([pca_data[user_indices.index(user_id)]], [pca_data[user_indices.index(idx)]])
        distances.append(distance_idx)

    # Normalize the distances within the range of 0-100
    normalized_distances = 1 - distances / np.max(distances)
    scores = normalized_distances * 100
    
    # Reshape the list
    scores = np.reshape(scores, (len(user_indices)))
    
    matches = []
    for i in range(len(filtered_user_indices)):
        matches.append((filtered_user_indices[i], scores[i]))
    
    # Sort the matches based on the highest score
    matches.sort(key=lambda x: x[1], reverse=True)

    print("User Index:", user_id)
    print("Matched Peers:")
    match_idx = [user_id]
    for match_index, score in matches:
        if user_df.iloc[match_index, 2] == user_df.iloc[user_id, 2]:
            match_idx.append(match_index)
            print("Index:", match_index, "| Score:", f"{score:.2f}%")
                
    # Check if Destination and roles is correct 
    print()         
    print(user_df.iloc[match_idx, [0, 1, 2]])
    
    # Check if they get same preference
    print()
    print(user_df.iloc[match_idx, 8:16])

In [16]:
matchmaking(0, match_array, clean_df)

User Index: 0
Matched Peers:
Index: 0 | Score: 100.00%
Index: 93 | Score: 96.81%
Index: 78 | Score: 91.74%
Index: 63 | Score: 84.48%
Index: 98 | Score: 84.29%
Index: 95 | Score: 83.38%
Index: 47 | Score: 82.77%
Index: 48 | Score: 82.53%
Index: 29 | Score: 81.92%
Index: 69 | Score: 78.16%
Index: 56 | Score: 77.95%
Index: 80 | Score: 71.72%
Index: 49 | Score: 71.07%
Index: 75 | Score: 69.33%
Index: 92 | Score: 68.19%
Index: 40 | Score: 67.39%
Index: 50 | Score: 66.47%
Index: 65 | Score: 66.02%
Index: 73 | Score: 65.63%
Index: 43 | Score: 64.53%
Index: 96 | Score: 62.67%
Index: 55 | Score: 62.59%
Index: 64 | Score: 62.47%
Index: 72 | Score: 61.39%
Index: 62 | Score: 59.76%
Index: 22 | Score: 59.47%
Index: 85 | Score: 59.35%
Index: 37 | Score: 55.87%
Index: 5 | Score: 52.81%
Index: 30 | Score: 52.60%
Index: 2 | Score: 52.52%
Index: 46 | Score: 51.97%
Index: 83 | Score: 51.91%
Index: 90 | Score: 51.64%
Index: 8 | Score: 50.29%
Index: 89 | Score: 49.86%
Index: 6 | Score: 49.50%
Index: 77 | S

In [17]:
matchmaking(58, match_array, clean_df)

User Index: 58
Matched Peers:
Index: 58 | Score: 100.00%
Index: 17 | Score: 82.16%
Index: 41 | Score: 81.25%
Index: 19 | Score: 77.44%
Index: 54 | Score: 75.42%
Index: 21 | Score: 71.93%
Index: 27 | Score: 69.69%
Index: 39 | Score: 68.89%
Index: 51 | Score: 68.88%
Index: 68 | Score: 66.54%
Index: 13 | Score: 62.68%
Index: 94 | Score: 62.02%
Index: 12 | Score: 62.01%
Index: 34 | Score: 61.47%
Index: 38 | Score: 61.08%
Index: 7 | Score: 60.66%
Index: 100 | Score: 58.71%
Index: 32 | Score: 58.41%
Index: 81 | Score: 56.55%
Index: 91 | Score: 56.43%
Index: 52 | Score: 55.11%
Index: 3 | Score: 54.57%
Index: 28 | Score: 54.06%
Index: 67 | Score: 53.42%
Index: 82 | Score: 52.20%
Index: 14 | Score: 51.42%
Index: 70 | Score: 51.22%
Index: 16 | Score: 51.07%
Index: 53 | Score: 49.47%
Index: 31 | Score: 48.87%
Index: 59 | Score: 48.45%
Index: 33 | Score: 46.80%
Index: 42 | Score: 40.99%
Index: 15 | Score: 39.91%
Index: 26 | Score: 37.19%
Index: 11 | Score: 37.14%
Index: 71 | Score: 35.88%
Index: 3