In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval
import time
from datetime import timedelta
import pickle

from scipy.sparse import hstack, vstack, csr_matrix, load_npz, save_npz
from scipy.stats import pearsonr

from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.width', 1400)

#### Functions

In [2]:
def get_train_and_test_data(df):
    from sklearn.model_selection import train_test_split
    df_exploded = df.explode(['rated_recipes', 'rating_list'])
    df_train_exploded, df_test_exploded = train_test_split(df_exploded, test_size=0.2, random_state=42)
    df_train = df_train_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    df_test = df_test_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    all_user_ids = df.index
    df_train = df_train.reindex(all_user_ids, fill_value=[]) # Re-index to ensure all user_ids are included
    df_test = df_test.reindex(all_user_ids, fill_value=[])
    return df_train, df_test

#### Load Data

In [3]:
# Create or load training and test data
converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
train_fn = 'dataset/User_Data_Train.csv'
test_fn = 'dataset/User_Data_Test.csv'
if os.path.exists(train_fn):
    print('Loading train and test userdata ...')
    df_train = pd.read_csv(train_fn, converters=converters, index_col='user_id')
    df_test =  pd.read_csv(test_fn, converters=converters, index_col='user_id')
else:
    print('Reading userdata dataframe ...')
    df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')
    print('Splitting userdata into training and test data ...')
    df_train, df_test = get_train_and_test_data(df_userdata.head(None).copy())
    df_train.to_csv(train_fn)
    df_test.to_csv(test_fn)
print('Done.')

Loading train and test userdata ...
Done.


#### Correlation Calculations

In [4]:
def get_correlation_coefficient(a_recipes: np.ndarray, a_ratings: dict, a_mean_rating: int, b_recipes: np.ndarray, b_ratings: dict, b_mean_rating: int):
    common_recipes = [ x for x in np.concatenate((a_recipes, b_recipes)) if (x in a_recipes and x in b_recipes) ]
    if common_recipes == []:
        return np.nan
    a_common_ratings = np.array([ a_ratings[x] for x in common_recipes ])
    b_common_ratings = np.array([ b_ratings[x] for x in common_recipes ])
    diffa = a_common_ratings - a_mean_rating
    diffb = b_common_ratings - b_mean_rating
    numerator = np.sum( diffa * diffb )
    holda = np.sum(diffa**2)
    holdb = np.sum(diffb**2)
    if holda == 0 or holdb == 0:
        return np.nan
    corr = numerator / np.sqrt(holda * holdb)
    return corr

In [5]:
# Prepare data for correlation calculations
recipe_arrays, ratings_dicts, mean_ratings = [], [], []
n_rows = len(df_train)
for i, (user_id, row) in enumerate(df_train.iterrows()):
    print('\r({:_}/{:_})'.format(i+1, n_rows), end='')
    recipes = np.array(df_train.iloc[i]['rated_recipes'])
    ratings = df_train.iloc[i]['rating_list']
    mean_rating = np.mean(ratings)
    ratings_map = { id_: rating for id_, rating in zip(recipes, ratings) }
    recipe_arrays.append(recipes)
    mean_ratings.append(mean_rating)
    ratings_dicts.append(ratings_map)
print('\nDone.')

(1_940/226_570)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(226_570/226_570)
Done.


In [12]:
# Get correlation coefficient for all users
def get_correlations_for_user(target_i):
    user_corrs = []
    n_rows = len(recipe_arrays)
    start = time.time()
    for index, J in enumerate(range(n_rows)):
        if index%500 == 0:
            print('\rGetting correlation coefficient for ({:_}/{:_}) '.format(index+1, n_rows), end='')
        corr = get_correlation_coefficient(recipe_arrays[target_i], ratings_dicts[target_i], mean_ratings[target_i], recipe_arrays[J], ratings_dicts[J], mean_ratings[J])
        user_corrs.append(corr)
    print(' done. Took {}'.format( time.strftime("%M:%S", time.gmtime(time.time()-start)) ))
    return user_corrs

In [7]:
# Get most similar users to user index using CC
def get_most_similar_CC(i, top_n=100):
    corrs = get_correlations_for_user(i)
    corrs_items = [ (idx, corr) for idx, corr in enumerate(corrs) if (corr != None and not np.isnan(corr)) ]
    corrs_items.sort(reverse=True, key=lambda x: x[1])
    return corrs_items[1:top_n+1]

In [8]:
def get_recommendations_collabFilter(user_i, user_rated_recipes, top_n_users=100, top_n_recipes=100):
    corrs_items = get_most_similar_CC(user_i, top_n=top_n_users)
    print('Got {} corrs_items'.format(len(corrs_items)))
    similar_user_ratings = []
    for user_idx, _ in corrs_items:
        similar_user_ratings.extend([ (recipe_id, rating) for recipe_id, rating in ratings_dicts[user_idx].items() if (recipe_id not in user_rated_recipes) ])
    similar_user_ratings.sort(reverse=True, key=lambda x: x[1])
    recommend = [ id_ for id_, _ in similar_user_ratings ][:top_n_recipes]
    return recommend

In [9]:
def generate_user_recommendations_collabFilter(index, save_name='dataset/recommendations_collabFilter.pkl', handle_limit=None, save_period=1, redo=False):
    recommendations = {}
    if os.path.exists(save_name):
        with open(save_name, 'rb') as f:
            recommendations = pickle.load(f)
    handled = 0
    for i, user_id in enumerate(index):
        if redo or user_id not in recommendations:
            print('\n({:_}/{:_}) Getting recommendations ({:_} handled)'.format(i+1, len(index), handled))
            user_rated_recipes = recipe_arrays[i]
            recommend = get_recommendations_collabFilter(i, user_rated_recipes, top_n_recipes=1000)
            print('Generated {} recommendations: {}'.format(len(recommend), recommend[:5]))
            recommendations[user_id] = recommend
            if save_period and i%save_period==0:
                with open(save_name, 'wb') as f:
                    pickle.dump(recommendations, f)
            handled += 1
            if handle_limit and handled >= handle_limit:
                break
    with open(save_name, 'wb') as f:
        pickle.dump(recommendations, f)
    print('Done generating recommendations.')

In [10]:
def load_user_recommendations_collabFilter(save_name='dataset/recommendations_collabFilter.pkl'):
    with open(save_name, 'rb') as f:
        recommendations = pickle.load(f)
    return recommendations

In [13]:
# Generate user recommendations
try:
    generate_user_recommendations_collabFilter(df_train.index, handle_limit=None)
except KeyboardInterrupt:
    print('\nKeyboardInterrupt detected')


(29/226_570) Getting recommendations (0 handled)
Getting correlation coefficient for (226_501/226_570)  done. Took 00:12
Got 28 corrs_items
Generated 1000 recommendations: [80871, 116402, 94597, 133090, 283098]

(30/226_570) Getting recommendations (1 handled)
Getting correlation coefficient for (226_501/226_570)  done. Took 00:05
Got 0 corrs_items
Generated 0 recommendations: []

(31/226_570) Getting recommendations (2 handled)
Getting correlation coefficient for (226_501/226_570)  done. Took 00:03
Got 0 corrs_items
Generated 0 recommendations: []

(32/226_570) Getting recommendations (3 handled)
Getting correlation coefficient for (226_501/226_570)  done. Took 00:05
Got 0 corrs_items
Generated 0 recommendations: []

(33/226_570) Getting recommendations (4 handled)
Getting correlation coefficient for (226_501/226_570)  done. Took 00:12
Got 45 corrs_items
Generated 1000 recommendations: [54099, 162657, 174017, 301649, 317083]

(34/226_570) Getting recommendations (5 handled)
Getting c

#### Evaluation

In [14]:

recommendations = load_user_recommendations_collabFilter()
len(recommendations)

1316

#### RUBBISH!

In [57]:
import random
n_cols, n_rows = 250_000, 250_000
# density = 50/n_cols
cols_i, rows_i, ratings = [], [], []
# if random.random() < 0.9:
print('Generating data ...')
for i in range(n_rows):
    for _ in range(random.randint(1, 99)):
        cols_i.append(random.randint(1, n_cols)-1)
        rows_i.append(i)
        ratings.append(random.randint(0,5))
len(cols_i)

Generating data ...


12497674

In [58]:
print('Making csr matrix ...')
matrix = csr_matrix((ratings, (rows_i, cols_i)), shape=(n_rows, n_cols))
print(type(matrix))

Making csr matrix ...
<class 'scipy.sparse._csr.csr_matrix'>


In [None]:
# 
sims = cosine_similarity(matrix[0], matrix)[0]
sims_items = sorted([ (i, sim) for i, sim in enumerate(sims) ], reverse=True, key=lambda item: item[1])

In [None]:
sims_items[:10]

In [65]:
def pearson_cc_sparce(v1, v2):
    # Calculate means, converting sparse vectors to dense for mean calculation
    mean1 = v1.mean()
    mean2 = v2.mean()
    
    # Center the sparse vectors by subtracting the mean
    # diff1 = v1 - mean1  # This will give the error as sparse does not support scalar subtraction
    # diff2 = v2 - mean2

    # Convert diff1 and diff2 to sparse-centered versions
    # Workaround: Subtract mean manually for nonzero elements only
    diff1 = v1.copy()  # Use copy to avoid modifying the original sparse arrays
    diff1.data -= mean1

    diff2 = v2.copy()
    diff2.data -= mean2

    # Compute covariance as the dot product of the centered vectors
    numerator = diff1.multiply(diff2).sum()

    # Compute standard deviations
    std1 = np.sqrt(diff1.multiply(diff1).sum())
    std2 = np.sqrt(diff2.multiply(diff2).sum())

    # Handle zero standard deviations to avoid division by zero
    if std1 == 0 or std2 == 0:
        return np.nan

    # Pearson correlation coefficient
    corr = numerator / (std1 * std2)
    return corr

In [63]:
def pearson_correlation(user1_ratings, user2_ratings):
    # Get the indices of non-NaN values in both users' ratings
    common_indices = (~np.isnan(user1_ratings)) & (~np.isnan(user2_ratings))
    
    # Extract the common ratings
    ratings_user1 = user1_ratings[common_indices]
    ratings_user2 = user2_ratings[common_indices]
    
    # Check if there are enough common ratings to calculate correlation
    if len(ratings_user1) < 2:
        return np.nan  # Not enough data to calculate correlation
    
    # Calculate Pearson correlation coefficient
    mean_user1 = np.mean(ratings_user1)
    mean_user2 = np.mean(ratings_user2)
    
    numerator = np.sum((ratings_user1 - mean_user1) * (ratings_user2 - mean_user2))
    denominator = np.sqrt(np.sum((ratings_user1 - mean_user1) ** 2)) * np.sqrt(np.sum((ratings_user2 - mean_user2) ** 2))
    
    if denominator == 0:
        return np.nan  # Avoid division by zero
    
    return numerator / denominator

In [66]:
vector_i = 0
target_vector = matrix[vector_i].toarray().ravel()
corrs = []
n_vectors = matrix.shape[0]
for i, vector in enumerate(matrix):
    print('\r({}/{})'.format(i+1, n_vectors), end='')
    # corr, pvalue = pearsonr(target_vector, vector.toarray().ravel())
    corr = pearson_cc_sparce(target_vector, vector.toarray().ravel())
    # corr = pearson_correlation(target_vector, vector.toarray().ravel())
    corrs.append(corr)
    # if i > 1000: break
print()
len(corrs)

(1/250000)

  diff1.data -= mean1
  diff2.data -= mean2


AttributeError: 'numpy.ndarray' object has no attribute 'multiply'

In [None]:
import numpy as np
from scipy import sparse

def sparse_corrcoef(A, B=None):

    if B is not None:
        A = sparse.vstack((A, B), format='csr')

    A = A.astype(np.float64)
    n = A.shape[1]

    # Compute the covariance matrix
    rowsum = A.sum(1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n
    C = (A.dot(A.T.conjugate()) - centering) / (n - 1)

    # The correlation coefficients are given by
    # C_{i,j} / sqrt(C_{i} * C_{j})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))

    return coeffs

In [None]:
# some smallish sparse random matrices
a = sparse.rand(1, 250_000, density=0.5, format='csr')
b = sparse.rand(10_000, 250_000, density=0.5, format='csr')

a.shape

coeffs1 = sparse_corrcoef(a, b)
coeffs1.shape
# coeffs1
# coeffs2 = np.corrcoef(a.todense(), b.todense())

# print(np.allclose(coeffs1, coeffs2))
# True

In [None]:
vector_i = 0
target_vector = matrix[vector_i]
corrs = []
n_vectors = matrix.shape[0]
for i, vector in enumerate(matrix):
    print('\r({}/{})'.format(i+1, n_vectors), end='')
    coeffs = sparse_corrcoef(target_vector, vector)
    corr = coeffs[0, 1]
    corrs.append(corr)
    if i > 10_000: break
print()
len(corrs)

In [None]:
coeffs = sparse_corrcoef(matrix[0], matrix)
coeffs.shape