In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sys

from sklearn.metrics.pairwise import euclidean_distances

In [2]:
clean_df = pd.read_csv('data/clean_data.csv')
clean_df

Unnamed: 0,Names,Genders,Age Ranges,Specialization,Categories,Math Tutor,Science Tutor,Social Tutor,Technology Tutor,Music Tutor,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,Novia Rizki Wulandari,1,17-25,Flutter,Technology,0,0,0,1,0,...,5,4,4,5,5,4,4,2,4,4
1,Nabhan Nabilah,1,17-25,Patung,Arts,0,0,0,0,0,...,5,2,5,5,5,2,2,5,5,2
2,Herlina Kusyanuri Putri,1,17-25,Digital Arts,Arts,0,0,0,0,0,...,5,5,5,5,5,4,3,3,4,3
3,Rifdah Alyaa,1,17-25,Web Development,Technology,0,0,0,1,0,...,2,3,2,3,4,3,2,3,4,3
4,Salsabilla,1,17-25,Krita,Arts,0,0,0,0,0,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Usamah,0,17-25,Pendidikan Kewarganegaraan,Social,0,0,1,0,0,...,5,5,4,5,5,4,4,5,5,5
97,Helena,1,17 - 25,Photoshop,Multimedia,0,0,0,0,0,...,4,3,4,4,4,3,4,2,5,4
98,Deva,1,26-34,Sejarah Dunia,Social,0,0,1,0,0,...,4,4,4,3,4,3,4,2,4,3
99,Dzaky,0,17-25,Biologi,Science,0,1,0,0,0,...,5,5,5,5,5,5,5,5,4,5


In [3]:
# Check that we not missing any value after combine columns
print('Is there any missing value? ', clean_df.isnull().values.any())
print('How many missing values? ', clean_df.isnull().values.sum())

Is there any missing value?  False
How many missing values?  0


In [4]:
# Copy origin user data into match data to use in cluster
match_df = clean_df.copy()

In [5]:
# Drop Column that not use for clustering match
match_df.drop([
    'Names',
    'Age Ranges',
    'Specialization',
    'Categories',
    'Math Tutor',
    'Science Tutor',
    'Social Tutor',
    'Technology Tutor',
    'Music Tutor',
    'Arts Tutor',
    'Multimedia Tutor',
    'Language Tutor'
], axis=1, inplace=True)

match_df

Unnamed: 0,Genders,EXT1,EXT2,EXT3,EXT4,EXT5,EST1,EST2,EST3,EST4,...,CSN1,CSN2,CSN3,CSN4,CSN5,OPN1,OPN2,OPN3,OPN4,OPN5
0,1,4,2,4,4,3,3,3,1,2,...,5,4,4,5,5,4,4,2,4,4
1,1,1,3,4,3,2,5,5,5,3,...,5,2,5,5,5,2,2,5,5,2
2,1,3,3,1,1,2,2,4,4,5,...,5,5,5,5,5,4,3,3,4,3
3,1,3,4,2,1,2,3,2,2,4,...,2,3,2,3,4,3,2,3,4,3
4,1,2,4,4,2,2,2,5,4,4,...,4,3,3,4,4,2,2,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,4,4,5,4,2,2,4,4,4,...,5,5,4,5,5,4,4,5,5,5
97,1,3,4,2,2,2,5,5,4,5,...,4,3,4,4,4,3,4,2,5,4
98,1,4,4,4,4,4,2,3,2,4,...,4,4,4,3,4,3,4,2,4,3
99,0,4,5,5,5,5,1,1,1,1,...,5,5,5,5,5,5,5,5,4,5


In [6]:
match_array = match_df.values
match_array.shape

(101, 26)

# Build PCA Model

## Input Size (1 Genders + 25 Answer)

In [7]:
class PCAModel(tf.Module):
    def __init__(self, x_mean, components):
        super(PCAModel, self).__init__()
        self.x_mean = tf.Variable(x_mean, trainable=False)
        self.components = tf.Variable(components, trainable=False)

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 26), dtype=tf.float32)])
    def scale_features(self, x):
        x_min = tf.reduce_min(x, axis=0)
        x_max = tf.reduce_max(x, axis=0)

        # Handle columns with zero variance
        zero_variance_mask = tf.math.equal(x_min, x_max)
        non_zero_variance_mask = tf.math.logical_not(zero_variance_mask)

        # Scale the columns with non-zero variance
        x_scaled_non_zero = tf.where(non_zero_variance_mask, (x - x_min) / (x_max - x_min), x)

        # Scale the columns with zero variance
        x_scaled_zero = tf.where(zero_variance_mask, tf.zeros_like(x), x_scaled_non_zero)

        return x_scaled_zero

    @tf.function(input_signature=[tf.TensorSpec(shape=(None, 26), dtype=tf.float32)])
    def apply_pca(self, x):
        x_scaled = self.scale_features(x)
        x_centered = x_scaled - self.x_mean
        
        # Compute the covariance matrix
        covariance_matrix = tf.matmul(tf.transpose(x_centered), x_centered) / tf.cast(tf.shape(x_centered)[0], dtype=tf.float32)

        # Perform eigenvalue decomposition
        eigenvalues, eigenvectors = tf.linalg.eigh(covariance_matrix)
        
        # Sort eigenvectors based on eigenvalues
        sorted_indices = tf.argsort(eigenvalues, direction='DESCENDING')
        sorted_eigenvectors = tf.gather(eigenvectors, sorted_indices, axis=1)
        
        # Select the top k eigenvectors
        k = tf.minimum(tf.shape(sorted_eigenvectors)[1], 2)  # Choose the top 2 eigenvectors (modify as needed)
        selected_eigenvectors = sorted_eigenvectors[:, :k]
        
        # Project the centered data onto the selected eigenvectors
        x_pca = tf.matmul(x_centered, selected_eigenvectors)
        
        return x_pca

# Convert the numpy array to a TensorFlow tensor
x_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Perform PCA with k=2 (reduce to 2 dimensions)
x_mean = tf.reduce_mean(x_tf, axis=0)
_, _, V = tf.linalg.svd(x_tf - x_mean)
components = V[:, :2]

# Create an instance of the PCA model
pca_model = PCAModel(x_mean, components)

In [8]:
# Save the PCAModel as a SavedModel
tf.saved_model.save(
    pca_model,
    export_dir='pca_matching/1/',
    signatures={
        'serving_default': pca_model.apply_pca.get_concrete_function()
    }
)

INFO:tensorflow:Assets written to: pca_matching/1/assets


In [9]:

!saved_model_cli show --dir {'pca_matching/1/'} --all


MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['__saved_model_init_op']:
  The given SavedModel SignatureDef contains the following input(s):
  The given SavedModel SignatureDef contains the following output(s):
    outputs['__saved_model_init_op'] tensor_info:
        dtype: DT_INVALID
        shape: unknown_rank
        name: NoOp
  Method name is: 

signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['x'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 26)
        name: serving_default_x:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['output_0'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 2)
        name: StatefulPartitionedCall:0
  Method name is: tensorflow/serving/predict
The MetaGraph with tag set ['serve'] contains the following ops: {'Min', 'MatMul', 'RealDiv', 'Max', 'Identity', 'StatefulPartitionedCall', 'ReadVaria

## Test Load Model Directly

In [10]:
# Load the PCA model
loaded_model = tf.saved_model.load("pca_matching/1/")

In [11]:
# Generate a new data point
new_data = np.random.rand(1, 26)
new_data_tf = tf.convert_to_tensor(new_data, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor([[-1.7317123e+01 -2.3841858e-07]], shape=(1, 2), dtype=float32)


In [12]:
# Predict PCA Valuse Using Our Match/Users Datasets
new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

# Predict the PCA value using the loaded model
pca_value = loaded_model.apply_pca(new_data_tf)

# Print the PCA value
print(pca_value)

tf.Tensor(
[[-1.39438839e+01  5.32377958e-01]
 [-1.45698795e+01 -5.26659250e-01]
 [-1.41649723e+01 -3.27734053e-01]
 [-1.51571198e+01 -2.32190311e-01]
 [-1.43135509e+01 -6.38926327e-01]
 [-1.46237736e+01 -3.23504448e-01]
 [-1.46204863e+01  1.39741337e+00]
 [-1.42957735e+01  9.24311876e-02]
 [-1.48252268e+01  6.90632463e-01]
 [-1.41056833e+01  1.39918756e+00]
 [-1.50502930e+01 -2.03321099e-01]
 [-1.37643776e+01 -4.74078655e-02]
 [-1.47877121e+01  5.62712073e-01]
 [-1.40906258e+01  3.27989101e-01]
 [-1.43428383e+01 -2.48354018e-01]
 [-1.39249220e+01 -2.66317964e-01]
 [-1.43345509e+01 -1.28089309e-01]
 [-1.50016174e+01 -5.69202483e-01]
 [-1.48620472e+01  1.53260112e-01]
 [-1.44163818e+01 -4.84670997e-01]
 [-1.33084288e+01  8.72605085e-01]
 [-1.48473377e+01 -1.54488087e-02]
 [-1.46117048e+01  2.48815417e-01]
 [-1.43094215e+01  2.42372155e-01]
 [-1.46729240e+01 -5.11651695e-01]
 [-1.50562553e+01 -1.63586378e-01]
 [-1.44426651e+01  4.61732030e-01]
 [-1.49806805e+01 -8.21066499e-01]
 [-1.4416

## Build Matchmaking Feature

In [13]:
def predict_pca(match_array):
    # Load the PCA model
    loaded_model = tf.saved_model.load('pca_matching/1/')
    new_data_tf = tf.convert_to_tensor(match_array, dtype=tf.float32)

    # Predict the PCA value using the loaded model
    pca_value = loaded_model.apply_pca(new_data_tf)
    pca_value_arr = pca_value.numpy()
    
    return pca_value_arr

In [14]:
def get_guides_idx_filtered(user_id, user_df):
    # Get user destination and persona
    user_destination = user_df.loc[user_id, 'Categories']

    # Filter users with the same destination and roles value of 'guide'
    filtered_users = user_df[(user_df['Categories'] == user_destination)]

    user_indices = []
    for idx in filtered_users.index:
        user_indices.append(idx)
    user_indices = [idx for idx in user_indices if idx != user_id]
            
    return user_indices

In [15]:
def matchmaking(user_id, match_array, user_df):
    
    filtered_user_indices = get_guides_idx_filtered(user_id, user_df)
    if len(filtered_user_indices) == 0:
        print("No Results")
        sys.exit()
        
    user_indices = filtered_user_indices + [user_id]
    
    pca_data = predict_pca(match_array[user_indices])

    distances = []
    for idx in user_indices:
        # Get the distances within the cluster of the specified user

        distance_idx = euclidean_distances([pca_data[user_indices.index(user_id)]], [pca_data[user_indices.index(idx)]])
        distances.append(distance_idx)

    # Normalize the distances within the range of 0-100
    normalized_distances = 1 - distances / np.max(distances)
    scores = normalized_distances * 100
    
    # Reshape the list
    scores = np.reshape(scores, (len(user_indices)))
    
    matches = []
    for i in range(len(filtered_user_indices)):
        matches.append((filtered_user_indices[i], scores[i]))
    
    # Sort the matches based on the highest score
    matches.sort(key=lambda x: x[1], reverse=True)

    print("User Index:", user_id)
    print("Matched Peers:")
    match_idx = [user_id]
    for match_index, score in matches:
        if user_df.iloc[match_index, 2] == user_df.iloc[user_id, 2]:
            match_idx.append(match_index)
            print("Index:", match_index, "| Score:", f"{score:.2f}%")
                
    # Check if Destination and roles is correct 
    print()         
    print(user_df.iloc[match_idx, [0, 1, 3, 4]])
    
    # Check if they get same preference
    print()
    print(user_df.iloc[match_idx, 5:])

In [16]:
matchmaking(0, match_array, clean_df)

User Index: 0
Matched Peers:
Index: 34 | Score: 78.64%
Index: 95 | Score: 49.28%
Index: 28 | Score: 38.01%
Index: 81 | Score: 32.34%
Index: 94 | Score: 29.46%
Index: 41 | Score: 6.40%
Index: 3 | Score: 3.48%

                     Names  Genders     Specialization  Categories
0   Novia Rizki Wulandari         1            Flutter  Technology
34                    Zeen        0  Backend Developer  Technology
95                   Salma        1           HTML CSS  Technology
28        Nur Alifia Riany        1            Android  Technology
81     Lili Nandita Auliya        1      Multiplatform  Technology
94                  Annisa        1            Laravel  Technology
41                 Gustian        0              React  Technology
3             Rifdah Alyaa        1    Web Development  Technology

    Math Tutor  Science Tutor  Social Tutor  Technology Tutor  Music Tutor   
0            0              0             0                 1            0  \
34           0              0  

In [17]:
matchmaking(58, match_array, clean_df)

User Index: 58
Matched Peers:
Index: 25 | Score: 91.07%
Index: 5 | Score: 74.95%
Index: 83 | Score: 57.54%
Index: 92 | Score: 55.38%
Index: 8 | Score: 51.84%
Index: 57 | Score: 27.30%
Index: 13 | Score: 26.46%

                 Names  Genders          Specialization  Categories
58     Alyzar Aviandi         0          Penulisan Buku  Multimedia
25              Elma N        1                   Excel  Multimedia
5              Maulani        1               Fotografi  Multimedia
83          Dhea Setya        1  Videografi dan Editing  Multimedia
92                 Féi        1            Editing Foto  Multimedia
8                   JJ        1              Videografi  Multimedia
57             kartika        1        Microsoft Office  Multimedia
13  Bayu Daru Isnandar        0            Audio Mixing  Multimedia

    Math Tutor  Science Tutor  Social Tutor  Technology Tutor  Music Tutor   
58           0              0             0                 0            0  \
25           0      