Imports

In [110]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from helper import Helper
helper = Helper()

# Load the users and the recommendations from traditional aggregation methods
user_profiles = helper.load_data('results/user_profiles.pkl')
groups_exp_1 = helper.load_data('results/groups_exp_1.pkl')
groups_exp_2 = helper.load_data('results/groups_exp_2.pkl')
recommendations_exp_1 = helper.load_data('results/recommendations_exp_1.pkl')
recommendations_exp_2 = helper.load_data('results/recommendations_exp_2.pkl')


Functions

In [111]:
# Get the user profiles containing tag ratings of each group
def get_groups(groups_exp):
    groups = {}
    for g, group in enumerate(groups_exp):
        members = {}
        for user in group:
            for id in user.keys():
                members[id] = user_profiles[id]
        groups[g] = members
    return groups

# Create matrix of users and tag ratings for computing similarity between users
def create_matrix(groups):
    result = {}
    for group_id, group in groups.items():
        # Step 1: Collect all unique strings for the current group
        unique_strings = set()
        for ratings in group.values():
            unique_strings.update(ratings.keys())
        
        # Convert the set to a list to use as columns in DataFrame
        unique_strings = list(unique_strings)

        # Step 2: Create mapping of user ids to row indices
        users = list(group.keys())
        user_index = {user: idx for idx, user in enumerate(users)}

        # Step 3: Initialize an empty matrix
        num_users = len(users)
        num_strings = len(unique_strings)
        matrix = np.full((num_users, num_strings), np.nan)  # Using np.nan for missing ratings

        # Step 4: Populate the matrix
        string_index = {string: idx for idx, string in enumerate(unique_strings)}
        for member, ratings in group.items():
            for string, rating in ratings.items():
                row_idx = user_index[member]
                col_idx = string_index[string]
                matrix[row_idx, col_idx] = rating

        # Convert to pandas DataFrame for better readability
        df = pd.DataFrame(matrix, index=users, columns=unique_strings)
        
        # Store the DataFrame in the result dictionary
        result[group_id] = df
        
    return result

# Similarity functions
def cosine_similarity(matrix):
    num_users = matrix.shape[0]
    similarity_matrix = np.zeros((num_users, num_users))

    for i in range(num_users):
        for j in range(num_users):
            if i != j:
                similarity_matrix[i, j] = 1 - cosine(matrix[i], matrix[j])
            else:
                similarity_matrix[i, j] = 1.0  # Similarity with itself is 1

    return similarity_matrix

def pearson_similarity(matrix):
    num_users = matrix.shape[0]
    similarity_matrix = np.zeros((num_users, num_users))

    for i in range(num_users):
        for j in range(num_users):
            if i != j:
                valid_indices = ~np.isnan(matrix[i]) & ~np.isnan(matrix[j])
                if np.sum(valid_indices) > 0:
                    similarity_matrix[i, j], _ = pearsonr(matrix[i, valid_indices], matrix[j, valid_indices])
                else:
                    similarity_matrix[i, j] = 0  # If no valid ratings overlap, similarity is 0
            else:
                similarity_matrix[i, j] = 1.0  # Similarity with itself is 1

    return similarity_matrix

def compute_group_similarities(groups_matrix):
    similarity_results = {}

    for group_id, df in groups_matrix.items():
        matrix = df.to_numpy()

        # Compute cosine similarity
        cosine_sim = cosine_similarity(matrix)
        cosine_sim_df = pd.DataFrame(cosine_sim, index=df.index, columns=df.index)

        # Compute pearson similarity
        pearson_sim = pearson_similarity(matrix)
        pearson_sim_df = pd.DataFrame(pearson_sim, index=df.index, columns=df.index)

        similarity_results[group_id] = {
            'cosine_similarity': cosine_sim_df,
            'pearson_similarity': pearson_sim_df
        }

    return similarity_results

Analysis

In [112]:
groups = get_groups(groups_exp_1)
groups_matrix = create_matrix(groups)
group_similarities = compute_group_similarities(groups_matrix)

In [113]:
group_similarities

{0: {'cosine_similarity':         11176   52448   64625   37036   288146
  11176      1.0     1.0     1.0     1.0     1.0
  52448      1.0     1.0     1.0     1.0     1.0
  64625      1.0     1.0     1.0     1.0     1.0
  37036      1.0     1.0     1.0     1.0     1.0
  288146     1.0     1.0     1.0     1.0     1.0,
  'pearson_similarity':           11176     52448     64625     37036     288146
  11176   1.000000  0.058479  0.068283  0.085263 -0.059640
  52448   0.058479  1.000000 -0.035605  0.028293 -0.066133
  64625   0.068283 -0.035605  1.000000  0.000721  0.028975
  37036   0.085263  0.028293  0.000721  1.000000 -0.007267
  288146 -0.059640 -0.066133  0.028975 -0.007267  1.000000},
 1: {'cosine_similarity':         2310    962690  283390  129201  779699
  2310       1.0     1.0     1.0     1.0     1.0
  962690     1.0     1.0     1.0     1.0     1.0
  283390     1.0     1.0     1.0     1.0     1.0
  129201     1.0     1.0     1.0     1.0     1.0
  779699     1.0     1.0     1.0  