In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys

from sklearn.metrics.pairwise import euclidean_distances

In [19]:
clean_df = pd.read_csv('data/clean_data.csv')
clean_df

Unnamed: 0,Names,Genders,Age Ranges,Specialization,Categories,Math Tutor,Science Tutor,Social Tutor,Technology Tutor,Music Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Novia Rizki Wulandari,Female,17-25,Flutter,Technology,0,0,0,1,0,...,5,4,4,5,5,4,4,2,4,4
1,Nabhan Nabilah,Female,17-25,Patung,Arts,0,0,0,0,0,...,5,2,5,5,5,2,2,5,5,2
2,Herlina Kusyanuri Putri,Female,17-25,Digital Arts,Arts,0,0,0,0,0,...,5,5,5,5,5,4,3,3,4,3
3,Rifdah Alyaa,Female,17-25,Web Development,Technology,0,0,0,1,0,...,2,3,2,3,4,3,2,3,4,3
4,Salsabilla,Female,17-25,Krita,Arts,0,0,0,0,0,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Usamah,Male,17-25,Pendidikan Kewarganegaraan,Social,0,0,1,0,0,...,5,5,4,5,5,4,4,5,5,5
97,Helena,Female,17 - 25,Photoshop,Multimedia,0,0,0,0,0,...,4,3,4,4,4,3,4,2,5,4
98,Deva,Female,26-34,Sejarah Dunia,Social,0,0,1,0,0,...,4,4,4,3,4,3,4,2,4,3
99,Dzaky,Male,17-25,Biologi,Science,0,1,0,0,0,...,5,5,5,5,5,5,5,5,4,5


In [20]:
# Check that we not missing any value after combine columns
print('Is there any missing value? ', clean_df.isnull().values.any())
print('How many missing values? ', clean_df.isnull().values.sum())

Is there any missing value?  False
How many missing values?  0


In [21]:
# Copy origin user data into match data to use in cluster
match_df = clean_df.copy()

In [22]:
# Drop Column that not use for clustering match
match_df.drop([
    'Names',
    'Genders',
    'Age Ranges',
    'Specialization',
    'Categories',
    'Math Tutor',
    'Science Tutor',
    'Social Tutor',
    'Technology Tutor',
    'Music Tutor',
    'Arts Tutor',
    'Multimedia Tutor',
    'Language Tutor'
], axis=1, inplace=True)

match_df

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EST1,EST2,EST3,EST4,EST5,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,4,2,4,4,3,3,3,1,2,3,...,5,4,4,5,5,4,4,2,4,4
1,1,3,4,3,2,5,5,5,3,2,...,5,2,5,5,5,2,2,5,5,2
2,3,3,1,1,2,2,4,4,5,3,...,5,5,5,5,5,4,3,3,4,3
3,3,4,2,1,2,3,2,2,4,3,...,2,3,2,3,4,3,2,3,4,3
4,2,4,4,2,2,2,5,4,4,3,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,4,4,5,4,2,2,4,4,4,4,...,5,5,4,5,5,4,4,5,5,5
97,3,4,2,2,2,5,5,4,5,4,...,4,3,4,4,4,3,4,2,5,4
98,4,4,4,4,4,2,3,2,4,3,...,4,4,4,3,4,3,4,2,4,3
99,4,5,5,5,5,1,1,1,1,4,...,5,5,5,5,5,5,5,5,4,5


In [23]:
match_array = match_df.values
match_array.shape

(101, 25)

# Build PCA Model

## Input Size (8 Categories + 25 Answer)

In [24]:
class PCAModel(tf.Module):
    def __init__(self, x_mean, components):
        super(PCAModel, self).__init__()
        self.x_mean = tf.Variable(x_mean, trainable=False)
        self.components = tf.Variable(components, trainable=False)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 25), dtype=tf.float32)])
    def scale_features(self, x):
        x_min = tf.reduce_min(x, axis=0)
        x_max = tf.reduce_max(x, axis=0)

        # Handle columns with zero variance
        zero_variance_mask = tf.math.equal(x_min, x_max)
        non_zero_variance_mask = tf.math.logical_not(zero_variance_mask)

        # Scale the columns with non-zero variance
        x_scaled_non_zero = tf.where(non_zero_variance_mask, (x - x_min) / (x_max - x_min), x)

        # Scale the columns with zero variance
        x_scaled_zero = tf.where(zero_variance_mask, tf.zeros_like(x), x_scaled_non_zero)

        return x_scaled_zero

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 25), dtype=tf.float32)])
    def apply_pca(self, x):
        x_scaled = self.scale_features(x)
        x_centered = x_scaled - self.x_mean
        
        # Compute the covariance matrix
        covariance_matrix = tf.matmul(tf.transpose(x_centered), x_centered) / tf.cast(tf.shape(x_centered)[0], dtype=tf.float32)

        # Perform eigenvalue decomposition
        eigenvalues, eigenvectors = tf.linalg.eigh(covariance_matrix)
        
        # Sort eigenvectors based on eigenvalues
        sorted_indices = tf.argsort(eigenvalues, direction='DESCENDING')
        sorted_eigenvectors = tf.gather(eigenvectors, sorted_indices, axis=1)
        
        # Select the top k eigenvectors
        k = tf.minimum(tf.shape(sorted_eigenvectors)[1], 2)  # Choose the top 2 eigenvectors (modify as needed)
        selected_eigenvectors = sorted_eigenvectors[:, :k]
        
        # Project the centered data onto the selected eigenvectors
        x_pca = tf.matmul(x_centered, selected_eigenvectors)
        
        return x_pca

# Convert the numpy array to a TensorFlow tensor
x_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Perform PCA with k=2 (reduce to 2 dimensions)
x_mean = tf.reduce_mean(x_tf, axis=0)
_, _, V = tf.linalg.svd(x_tf - x_mean)
components = V[:, :2]

# Create an instance of the PCA model
pca_model = PCAModel(x_mean, components)

In [25]:
# Save the PCAModel as a SavedModel
tf.saved_model.save(
    pca_model,
    export_dir='pca_matching/1/',
    signatures={
        'serving_default': pca_model.apply_pca.get_concrete_function()
    }
)

INFO:tensorflow:Assets written to: pca_matching/1/assets


In [26]:

!saved_model_cli show --dir {'pca_matching/1/'} --all


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['x'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 25)
        name: serving_default_x:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['output_0'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 2)
        name: StatefulPartitionedCall:0
  Method name is: tensorflow/serving/predict
The MetaGraph with tag set ['serve'] contains the following ops: {'Shape', 'RestoreV2', 'Minimum', 'StridedSlice', 'Const', 'Identity', 'Pack', 'Cast'

## Test Load Model Directly

In [27]:
# Load the PCA model
loaded_model = tf.saved_model.load("pca_matching/1/")

In [28]:
# Generate a new data point
new_data = np.random.rand(1, 25)
new_data_tf = tf.convert_to_tensor(new_data, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor([[-1.7302851e+01  1.7285347e-06]], shape=(1, 2), dtype=float32)


In [29]:
# Predict PCA Valuse Using Our Match/Users Datasets
new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor(
[[-1.39439363e+01 -6.07908845e-01]
 [-1.45699320e+01  4.69947815e-01]
 [-1.41650257e+01  2.57207811e-01]
 [-1.51571732e+01  1.74559951e-01]
 [-1.43136015e+01  5.91525495e-01]
 [-1.46238260e+01  2.65632689e-01]
 [-1.46203651e+01 -1.28318977e+00]
 [-1.42956524e+01  3.33956480e-02]
 [-1.48252792e+01 -7.42141604e-01]
 [-1.41055622e+01 -1.28963482e+00]
 [-1.50503464e+01  1.80672199e-01]
 [-1.37644310e+01 -6.68561459e-03]
 [-1.47875919e+01 -4.49428916e-01]
 [-1.40905056e+01 -1.79685324e-01]
 [-1.43428917e+01  2.04492390e-01]
 [-1.39249744e+01  2.09402472e-01]
 [-1.43344326e+01  2.50321805e-01]
 [-1.50014973e+01  7.13888407e-01]
 [-1.48621006e+01 -2.30364442e-01]
 [-1.44162626e+01  6.29514754e-01]
 [-1.33084812e+01 -9.43474054e-01]
 [-1.48472157e+01  1.18019998e-01]
 [-1.46117573e+01 -3.06396008e-01]
 [-1.43094740e+01 -3.00646216e-01]
 [-1.46729774e+01  4.69605863e-01]
 [-1.50563078e+01  1.06678903e-01]
 [-1.44427176e+01 -5.26892722e-01]
 [-1.49807339e+01  7.82000184e-01]
 [-1.4417

## Build Matchmaking Feature

In [30]:
def predict_pca(match_array):
    # Load the PCA model
    loaded_model = tf.saved_model.load('pca_matching/1/')
    new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

    # Predict the PCA value using the loaded model
    pca_value = loaded_model.apply_pca(new_data_tf)
    pca_value_arr = pca_value.numpy()
    
    return pca_value_arr

In [31]:
def get_guides_idx_filtered(user_id, user_df):
    # Get user destination and persona
    user_destination = user_df.loc[user_id, 'Categories']

    # Filter users with the same destination and roles value of 'guide'
    filtered_users = user_df[(user_df['Categories'] == user_destination)]

    user_indices = []
    for idx in filtered_users.index:
        user_indices.append(idx)
    user_indices = [idx for idx in user_indices if idx != user_id]
            
    return user_indices

In [32]:
def matchmaking(user_id, match_array, user_df):
    
    filtered_user_indices = get_guides_idx_filtered(user_id, user_df)
    if len(filtered_user_indices) == 0:
        print("No Results")
        sys.exit()
        
    user_indices = filtered_user_indices + [user_id]
    
    pca_data = predict_pca(match_array[user_indices])

    distances = []
    for idx in user_indices:
        # Get the distances within the cluster of the specified user

        distance_idx = euclidean_distances([pca_data[user_indices.index(user_id)]], [pca_data[user_indices.index(idx)]])
        distances.append(distance_idx)

    # Normalize the distances within the range of 0-100
    normalized_distances = 1 - distances / np.max(distances)
    scores = normalized_distances * 100
    
    # Reshape the list
    scores = np.reshape(scores, (len(user_indices)))
    
    matches = []
    for i in range(len(filtered_user_indices)):
        matches.append((filtered_user_indices[i], scores[i]))
    
    # Sort the matches based on the highest score
    matches.sort(key=lambda x: x[1], reverse=True)

    print("User Index:", user_id)
    print("Matched Peers:")
    match_idx = [user_id]
    for match_index, score in matches:
        if user_df.iloc[match_index, 2] == user_df.iloc[user_id, 2]:
            match_idx.append(match_index)
            print("Index:", match_index, "| Score:", f"{score:.2f}%")
                
    # Check if Destination and roles is correct 
    print()         
    print(user_df.iloc[match_idx, [0, 1, 2, 3, 4]])
    
    # Check if they get same preference
    print()
    print(user_df.iloc[match_idx, 5:])

In [33]:
matchmaking(0, match_array, clean_df)

User Index: 0
Matched Peers:
Index: 34 | Score: 77.37%
Index: 95 | Score: 46.74%
Index: 28 | Score: 38.69%
Index: 81 | Score: 33.87%
Index: 94 | Score: 29.72%
Index: 3 | Score: 2.83%
Index: 41 | Score: 0.68%

                     Names Genders Age Ranges     Specialization  Categories
0   Novia Rizki Wulandari   Female      17-25            Flutter  Technology
34                    Zeen    Male      17-25  Backend Developer  Technology
95                   Salma  Female      17-25           HTML CSS  Technology
28        Nur Alifia Riany  Female      17-25            Android  Technology
81     Lili Nandita Auliya  Female      17-25      Multiplatform  Technology
94                  Annisa  Female      17-25            Laravel  Technology
3             Rifdah Alyaa  Female      17-25    Web Development  Technology
41                 Gustian    Male      17-25              React  Technology

    Math Tutor  Science Tutor  Social Tutor  Technology Tutor  Music Tutor   
0            0     

In [34]:
matchmaking(58, match_array, clean_df)

User Index: 58
Matched Peers:
Index: 25 | Score: 78.81%
Index: 5 | Score: 68.11%
Index: 83 | Score: 56.30%
Index: 92 | Score: 45.85%
Index: 8 | Score: 41.01%
Index: 13 | Score: 28.18%
Index: 57 | Score: 24.33%

                 Names Genders Age Ranges          Specialization  Categories
58     Alyzar Aviandi     Male      17-25          Penulisan Buku  Multimedia
25              Elma N  Female      17-25                   Excel  Multimedia
5              Maulani  Female      17-25               Fotografi  Multimedia
83          Dhea Setya  Female      17-25  Videografi dan Editing  Multimedia
92                 Féi  Female      17-25            Editing Foto  Multimedia
8                   JJ  Female      17-25              Videografi  Multimedia
13  Bayu Daru Isnandar    Male      17-25            Audio Mixing  Multimedia
57             kartika  Female      17-25        Microsoft Office  Multimedia

    Math Tutor  Science Tutor  Social Tutor  Technology Tutor  Music Tutor   
58      