<a href="https://colab.research.google.com/github/Jackson00Han/Datasets/blob/master/Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

User-User Matching Algorithm

1. basic similarity score calculation algorithm
2. content-based filtering algorithm
3. collaborative filtering algorithm
4. hybrid system

In [1]:
# Import necessary libraries
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
tf.random.set_seed(1)

In [3]:


# Set seed for reproducibility (so results are consistent when generating random values)
np.random.seed(1)

# Step 1: Generate synthetic user data

num_samples = 100  # Define the number of user profiles

# Generate random data for each column (attributes of each user)
names = [f"User_{i}" for i in range(1, num_samples + 1)]  # User names
ages = np.random.randint(18, 65, size=num_samples)  # Ages between 18 and 64
nationalities = np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'France', 'India', 'China'], size=num_samples)
languages = np.random.choice(['English', 'French', 'German', 'Spanish', 'Hindi', 'Chinese'], size=num_samples)
residence_countries = np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'France', 'India', 'China'], size=num_samples)
postal_codes = np.random.randint(10000, 99999, size=num_samples)  # Random postal codes
occupations = np.random.choice(['Engineer', 'Artist', 'Doctor', 'Lawyer', 'Teacher', 'Entrepreneur'], size=num_samples)
marital_statuses = np.random.choice(['Single', 'Married', 'Divorced'], size=num_samples)
books = np.random.choice(['Fiction', 'Non-fiction', 'Sci-Fi', 'Fantasy', 'Biography', 'History'], size=num_samples)
music = np.random.choice(['Rock', 'Jazz', 'Classical', 'Pop', 'Hip-hop', 'Country'], size=num_samples)
activity_levels = np.random.randint(1, 11, size=num_samples)  # Activity levels between 1 and 10
mana_levels = np.random.randint(0, 101, size=num_samples)  # MANA levels between 0 and 100

# Step 2: Create a DataFrame to store the user data

user_data = pd.DataFrame({
    'Name': names,
    'Age': ages,
    'Nationality': nationalities,
    'Language': languages,
    'Residence Country': residence_countries,
    'Postal Code': postal_codes,
    'Occupation': occupations,
    'Marital Status': marital_statuses,
    'Favorite Book Genre': books,
    'Favorite Music Genre': music,
    'Activity Level': activity_levels,
    'MANA': mana_levels  # Adding the 'MANA' column, which is a numeric feature
})



# Step 3: Preprocess and encode categorical features

# List of categorical columns to encode
categorical_columns = ['Nationality', 'Language', 'Residence Country', 'Occupation', 'Marital Status',
                       'Favorite Book Genre', 'Favorite Music Genre']

# Initialize LabelEncoders to convert categorical values into numeric labels
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    user_data[col + '_encoded'] = label_encoders[col].fit_transform(user_data[col])

# Step 4: Prepare features for similarity calculation

# Numeric features for Euclidean distance
numeric_features = ['Age', 'Activity Level', 'MANA']

# Encoded categorical features for Cosine and Jaccard similarities
categorical_features = [col + '_encoded' for col in categorical_columns]

# Step 5: Define similarity functions

# Function to calculate Euclidean similarity for numeric features
def euclidean_similarity(user1_data, user2_data, features):
    # Euclidean distance is converted to similarity by using 1 / (1 + distance)
    return 1 / (1 + euclidean(user1_data[features], user2_data[features]))

# Function to calculate Cosine similarity for categorical features
def cosine_similarity_features(user1_data, user2_data, features):
    # Cosine similarity compares the angles between vectors of encoded features
    return cosine_similarity([user1_data[features]], [user2_data[features]])[0][0]

# One-hot encode categorical features for Jaccard similarity
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(user_data[categorical_features])

# Function to calculate Jaccard similarity using one-hot encoded data
def jaccard_similarity_onehot(user1_idx, user2_idx, onehot_encoded_data):
    # Compare one-hot encoded vectors to measure set similarity
    user1 = onehot_encoded_data[user1_idx]
    user2 = onehot_encoded_data[user2_idx]
    intersection = np.sum(np.minimum(user1, user2))
    union = np.sum(np.maximum(user1, user2))
    return intersection / union if union != 0 else 0

# Step 6: Define a function to calculate the overall similarity

def calculate_similarity(user1_idx, user2_idx, user_data, onehot_encoded_data):
    # Extract the data for the two users being compared
    user1_data = user_data.iloc[user1_idx]
    user2_data = user_data.iloc[user2_idx]

    # Compute Euclidean similarity on numeric features
    euclidean_sim = euclidean_similarity(user1_data, user2_data, numeric_features)

    # Compute Cosine similarity on encoded categorical features
    cosine_sim = cosine_similarity_features(user1_data, user2_data, categorical_features)

    # Compute Jaccard similarity on one-hot encoded categorical data
    jaccard_sim = jaccard_similarity_onehot(user1_idx, user2_idx, onehot_encoded_data)

    # Combine the three similarity measures into an overall similarity score
    overall_similarity = np.mean([euclidean_sim, cosine_sim, jaccard_sim])

    return overall_similarity

# Step 7: Function to compute similarities and sort them in descending order

def find_most_similar_users(target_user_idx, user_data, onehot_encoded_data, numeric_features, categorical_features,top_n):
    similarities = []

    # Loop through all users and calculate the similarity with the target user
    for user_idx in range(len(user_data)):
        if user_idx != target_user_idx:  # Skip the target user itself
            sim_score = calculate_similarity(target_user_idx, user_idx, user_data, onehot_encoded_data)  # Call the similarity function
            similarities.append((user_idx, sim_score))  # Store user index and similarity score

    # Sort the list of tuples by similarity score in descending order
    sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

    return sorted_similarities[:top_n]

def recommend_for_all_users_dataframe(user_data, onehot_encoded_data, numeric_features, categorical_features, top_n):
    # Create a list to hold results for each user
    all_recommendations = []

    # Loop through each user and get the top N similar users for that user
    for user_idx in range(len(user_data)):
        top_similar_users = find_most_similar_users(user_idx, user_data, onehot_encoded_data, numeric_features, categorical_features, top_n)
        # Extract only the user indices (first element of each tuple) from the list of tuples
        top_similar_user_indices = [user[0] for user in top_similar_users]
        # Append the target user and their top N similar users as a row
        all_recommendations.append([user_idx] + top_similar_user_indices)

    # Create column names for the DataFrame
    column_names = ['User'] + [f'Top_{i+1}_Similar_User' for i in range(top_n)]

    # Convert the list of recommendations into a pandas DataFrame
    recommendations_df = pd.DataFrame(all_recommendations, columns=column_names)

    return recommendations_df

# Example: Find the top 3 most similar users for all users and display in a DataFrame
recommendations_df = recommend_for_all_users_dataframe(user_data, onehot_encoded, numeric_features, categorical_features, top_n=4)



Let's simulate a scenario where, after using a simple algorithm to calculate user similarity and recommend the top 10 people for each user, feedback is collected a few days later. Based on the feedback, each user shows interest in 0 to 2 other users. The interest is generated by assuming it is distributed as follows:

90% of the users they are interested in come from the list of 10 recommended users.

10% of the users they are interested in come from other users outside the recommended list.

In [4]:
# Second Phase for further matching
df = user_data.select_dtypes(exclude=['object'])

def generate_interest_labels(num_users, recommendations_df):
    interest_labels = {}

    for user_idx in range(num_users):
        # Get the list of recommended users for the current user
        recommended_users = recommendations_df.iloc[user_idx, 1:].values.tolist()  # Get recommended users

        # Select the top 1 user from the recommended list (the first user in the list)
        top_recommended_user = recommended_users[0]  # The highest-ranked user is the first one

        # Assign this top recommended user as the interested user for this user
        interest_labels[user_idx] = [top_recommended_user]  # Keep it as a list to maintain consistency

    return interest_labels

# Generate interest labels for 100 users
interest_labels = generate_interest_labels(100, recommendations_df)



def construct_double_tower_input(user_data, interest_labels, num_users):
    X_user = []
    X_recommended = []
    y_train = []

    # Iterate over all users and their interest labels
    for user_idx in range(num_users):
        for recommended_idx in range(num_users):
            if user_idx != recommended_idx:  # Skip self-pairing
                # Append user features and recommended user features
                X_user.append(user_data.iloc[user_idx].values)
                X_recommended.append(user_data.iloc[recommended_idx].values)

                # Label is 1 if the user is interested in the recommended user, 0 otherwise
                y_train.append(1 if recommended_idx in interest_labels[user_idx] else 0)

    # Convert lists to NumPy arrays
    X_user = np.array(X_user)
    X_recommended = np.array(X_recommended)
    y_train = np.array(y_train)

    return X_user, X_recommended, y_train
# Call the function to construct input features
X_user, X_recommended, y_train = construct_double_tower_input(df, interest_labels, 100)

# Check the shape of the output
print("X_user shape:", X_user.shape)
print("X_recommended shape:", X_recommended.shape)
print("y_train shape:", y_train.shape)

X_user shape: (9900, 11)
X_recommended shape: (9900, 11)
y_train shape: (9900,)


In [5]:
# scale training data
X_user_unscaled = X_user
X_recommended_unscaled = X_recommended

# Standard scaling for X_recommended (formerly item_train)
scalerItem = StandardScaler()
scalerItem.fit(X_recommended)
X_recommended = scalerItem.transform(X_recommended)

# Standard scaling for X_user (formerly user_train)
scalerUser = StandardScaler()
scalerUser.fit(X_user)
X_user = scalerUser.transform(X_user)

# Now X_user, X_recommended, and y_train are scaled

In [6]:
from imblearn.over_sampling import SMOTE

# Combine the two inputs into one for oversampling purposes
combined_inputs = np.hstack((X_user, X_recommended))

# Apply SMOTE to the combined inputs and labels
smote = SMOTE()
combined_inputs_resampled, y_train_resampled = smote.fit_resample(combined_inputs, y_train)

# Split the resampled combined inputs back into X_user and X_recommended
X_user = combined_inputs_resampled[:, :X_user.shape[1]]
X_recommended = combined_inputs_resampled[:, X_user.shape[1]:]

In [9]:
np.sum(y_train)

7859

In [8]:
y_train = y_train_resampled.copy()
# Split X_recommended, X_user, and y_train into training and testing sets (80% train, 20% test)
X_recommended_train, X_recommended_test = train_test_split(X_recommended, train_size=0.80, shuffle=True)
X_user_train, X_user_test = train_test_split(X_user, train_size=0.80, shuffle=True)
y_train, y_test = train_test_split(y_train, train_size=0.80, shuffle=True)

# Print the shapes of the training and testing datasets
print(f"Recommended user (item) training data shape: {X_recommended_train.shape}")
print(f"Recommended user (item) test data shape: {X_recommended_test.shape}")
print(f"Target user training data shape: {X_user_train.shape}")
print(f"Target user test data shape: {X_user_test.shape}")
print(f"y_train training data shape: {y_train.shape}")
print(f"y_test test data shape: {y_test.shape}")

Recommended user (item) training data shape: (15680, 11)
Recommended user (item) test data shape: (3920, 11)
Target user training data shape: (15680, 11)
Target user test data shape: (3920, 11)
y_train training data shape: (15680,)
y_test test data shape: (3920,)


In [20]:
from tensorflow.keras.regularizers import l2

# Custom layer for L2 normalization
class L2NormalizationLayer(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.math.l2_normalize(inputs, axis=1)

# Define the user and recommended user tower neural networks
num_outputs = 32  # Increased output dimension for concatenation later

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

recommended_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

# Create the user input and connect it to the user tower
input_user = tf.keras.layers.Input(shape=(X_user.shape[1],))
vu = user_NN(input_user)
vu = L2NormalizationLayer()(vu)  # Normalize the output using the custom layer

# Create the recommended user input and connect it to the recommended user tower
input_recommended = tf.keras.layers.Input(shape=(X_recommended.shape[1],))
vr = recommended_NN(input_recommended)
vr = L2NormalizationLayer()(vr)  # Normalize the output using the custom layer

# Concatenate the two feature vectors from the user and recommended user towers
concatenated = tf.keras.layers.Concatenate()([vu, vr])

# Add additional dense layers after concatenation to learn more complex relationships
dense_combined = tf.keras.layers.Dense(64, activation='relu')(concatenated)
dense_combined = tf.keras.layers.Dense(32, activation='relu')(dense_combined)

# Output layer with sigmoid activation for binary classification
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense_combined)

# Specify the inputs and output of the model
model = tf.keras.Model([input_user, input_recommended], output)

# Print the model summary
model.summary()

# Compile the model with additional evaluation metrics
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()])

In [21]:
# Define the cost function as Mean Squared Error (for regression problems) or binary crossentropy if it's a classification task
cost_fn = tf.keras.losses.BinaryCrossentropy()  # Assuming binary classification task

# Define the optimizer with a learning rate
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Compile the model with the defined optimizer and loss function
model.compile(optimizer=opt,
              loss=cost_fn,
              metrics=['accuracy'])  # Add accuracy as a metric

# Train the model using the user and recommended user features
# Make sure to pass the training data (X_user_train and X_recommended_train) along with the labels y_train
history = model.fit([X_user_train, X_recommended_train], y_train, epochs=50, batch_size=32)

Epoch 1/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.5018 - loss: 1.2059
Epoch 2/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4960 - loss: 1.0593
Epoch 3/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5066 - loss: 0.9551
Epoch 4/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.4991 - loss: 0.8847
Epoch 5/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5159 - loss: 0.8343
Epoch 6/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5139 - loss: 0.7978
Epoch 7/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5158 - loss: 0.7720
Epoch 8/50
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5168 - loss: 0.7529
Epoch 9/50
[1m490/490[0m [32m━━━━━━━━

KeyboardInterrupt: 

In [18]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([X_user_test, X_recommended_test], y_test)

# Print the test results
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5106 - loss: 0.7857
Test Loss: 0.7894521951675415
Test Accuracy: 0.5068877339363098


In [13]:
y_test

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
predictions = model.predict([X_user_test, X_recommended_test])

predicted_labels = (predictions > 0.29).astype(int)

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [27]:
from sklearn.metrics import confusion_matrix

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, predicted_labels)

# Display confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1948    0]
 [  32    0]]
