The collaborative filtering approach below is based on <a href="https://pub.towardsai.net/recommendation-system-in-depth-tutorial-with-python-for-netflix-using-collaborative-filtering-533ff8a0e444">this Towards AI tutorial on building a recommendation system for Netflix with collaborative filtering in Python</a>.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
from scipy import sparse
import cf_multiprocessing
import concurrent.futures
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Open the SQLite review database and pull the three columns the recommender
# works with (wine, rating, user) from the ``review`` table into a DataFrame.
con_rev = sqlite3.connect('../database/review.db')
reviews_df = pd.read_sql_query(
    sql="SELECT wine_id, rating, user_id from review",
    con=con_rev,
)
# Notebook preview of the first rows.
reviews_df.head()

In [None]:
def _reviews_from_active_users(reviews: pd.DataFrame, min_reviews: int) -> pd.DataFrame:
    """Return the rows of *reviews* written by users with at least *min_reviews* reviews.

    The row order and index of *reviews* are preserved. Rows whose ``user_id``
    is missing are never selected.
    """
    # Count every user's reviews once and select with a vectorised membership
    # test; this avoids calling a Python lambda for each user group.
    reviews_per_user = reviews.user_id.value_counts()
    active_users = reviews_per_user.index[reviews_per_user >= min_reviews]
    return reviews[reviews.user_id.isin(active_users)]


# Only the 100-review subset is used further down; the other thresholds are
# kept as ready-to-enable alternatives.
# user_50_revs = _reviews_from_active_users(reviews_df, min_reviews=50)
user_100_revs = _reviews_from_active_users(reviews_df, min_reviews=100)
# user_200_revs = _reviews_from_active_users(reviews_df, min_reviews=200)
# user_500_revs = _reviews_from_active_users(reviews_df, min_reviews=500)

In [None]:
# Number of distinct wines in the full review set and in the active-user
# subset. ``nunique`` counts every distinct id, including an id of 0, which a
# ``np.count_nonzero`` over the unique values would silently leave out.
wine_count_total = reviews_df.wine_id.nunique()
# wine_count_50 = user_50_revs.wine_id.nunique()
wine_count_100 = user_100_revs.wine_id.nunique()
# wine_count_200 = user_200_revs.wine_id.nunique()
# wine_count_500 = user_500_revs.wine_id.nunique()

In [None]:
# Report which share of all distinct wines is still covered when only users
# with a minimum number of reviews are kept.
#
# The figures are derived directly from ``reviews_df`` for every threshold:
# only the 100-review subset is materialised in this notebook (the 50/200/500
# variants are commented out), so reading ``wine_count_50`` etc. would raise a
# NameError. All four lines use the same two-decimal percentage format.
_total_wines = reviews_df.wine_id.nunique()
_reviews_per_user = reviews_df.user_id.value_counts()
_coverage_report = []
for _min_reviews in (50, 100, 200, 500):
    _active_users = _reviews_per_user.index[_reviews_per_user >= _min_reviews]
    _covered_wines = reviews_df.loc[reviews_df.user_id.isin(_active_users), 'wine_id'].nunique()
    _coverage_report.append(
        f'Set with users more than {_min_reviews} reviews contains: '
        f'{_covered_wines / _total_wines:.2%} of all wines'
    )
print('\n'.join(_coverage_report) + '\n')

In [None]:
def get_sparse_wine_user_matrix(df: pd.DataFrame):
    """Build a sparse ratings matrix from a review table.

    Rows are indexed by ``user_id`` and columns by ``wine_id``; each stored
    value is the corresponding ``rating``.

    :param df: frame with ``user_id``, ``wine_id`` and ``rating`` columns;
        both id columns must hold non-negative integers because they are used
        directly as matrix coordinates.
    :return: a ``scipy.sparse.csr_matrix``. No explicit shape is passed, so
        SciPy infers it from the largest ids present in *df*, and repeated
        (user, wine) pairs are summed by the constructor.
    """
    row_ids = df.user_id
    col_ids = df.wine_id
    ratings = df.rating
    return sparse.csr_matrix((ratings, (row_ids, col_ids)))

In [None]:
def get_sim_matrix(matrix: sparse.csr_matrix):
    """Compute pairwise cosine similarities between the rows of *matrix*.

    :param matrix: sparse matrix whose rows are the vectors to compare.
    :return: the similarity matrix, kept sparse (``dense_output=False``).
    """
    similarities = cosine_similarity(matrix, dense_output=False)
    return similarities

In [None]:
def get_top_n_similar_users(n: int, sim_matrix: sparse.csr_matrix, input_user: int):
    """Return up to *n* users most similar to *input_user*, most similar first.

    Only users with a non-zero similarity entry in the input user's row are
    considered, and the input user itself is excluded.

    :param n: maximum number of users to return; ``n <= 0`` yields an empty list.
    :param sim_matrix: square user-by-user similarity matrix (row/column = user id).
    :param input_user: row index of the user to find neighbours for.
    :return: list of user ids ordered by descending similarity. Among users
        with equal similarity, the one stored later in the row comes first.
    """
    if n <= 0:
        # ``some_list[-0:]`` is the whole list, so "no neighbours" has to be
        # handled explicitly instead of through the slice below.
        return []

    # One row extraction instead of a separate sparse lookup per neighbour.
    row = sim_matrix[input_user, :].tocoo()
    keep = (row.data != 0) & (row.col != input_user)
    candidates = row.col[keep]
    similarities = row.data[keep]

    # Stable ascending sort, then take the tail and reverse it: this yields
    # descending similarity with a deterministic order for ties.
    ascending = np.argsort(similarities, kind='stable')
    return list(candidates[ascending][-n:][::-1])

In [None]:
def get_n_predictions(input_user: int, similar_users: list, reviews: pd.DataFrame, threshold: float,
                      is_evaluation: bool, test_reviews: pd.DataFrame = None):
    """Predict ratings for wines the input user has not rated yet.

    For every similar user (in the given order) each of their rated wines that
    the input user has not rated receives the prediction
    ``input_user_avg + (similar_user_rating - similar_user_avg)``, rounded to
    the nearest half point. The first similar user who rated a wine decides
    its prediction; later users do not overwrite it.

    All ratings are read from *reviews*, so when a training split is passed in
    no held-out rating can leak into the predictions.

    :param input_user: id of the user to predict for.
    :param similar_users: ids of neighbouring users, most similar first.
    :param reviews: frame with ``user_id``, ``wine_id`` and ``rating`` columns
        that both the averages and the candidate ratings are taken from.
    :param threshold: predictions below this value are dropped from the result.
    :param is_evaluation: when true, only wines that the input user rated in
        the held-out reviews are predicted, so every prediction has a ground truth.
    :param test_reviews: held-out reviews used when *is_evaluation* is true.
        When omitted, the module-level ``test_rev_df`` is used.
    :return: dict mapping ``wine_id`` to the predicted rating (>= *threshold*).
    """
    own_reviews = reviews[reviews.user_id == input_user]
    input_user_avg_rating = np.round(own_reviews.rating.mean(), decimals=1)
    input_user_rated_wines = np.unique(own_reviews.wine_id)

    # Candidate rows are computed once for all similar users: wines the input
    # user has not rated, optionally restricted to their held-out wines.
    candidate_mask = ~reviews.wine_id.isin(input_user_rated_wines)
    if is_evaluation:
        if test_reviews is None:
            test_reviews = test_rev_df
        held_out_wines = test_reviews.loc[test_reviews.user_id == input_user, 'wine_id']
        candidate_mask = candidate_mask & reviews.wine_id.isin(held_out_wines)
    candidates = reviews.loc[candidate_mask, ['user_id', 'wine_id', 'rating']]

    wine_prediction = {}
    for sim_user in similar_users:
        user_avg_rating = reviews[reviews.user_id == sim_user].rating.mean()
        sim_user_rows = candidates[candidates.user_id == sim_user]
        # Walking the rows in order means the first rating a similar user gave
        # a wine is the one that is used, without a per-wine lookup.
        for wine, rating in zip(sim_user_rows.wine_id, sim_user_rows.rating):
            if wine not in wine_prediction:
                # Multiply by two, round, halve: snaps the value to a 0.5 grid.
                wine_prediction[wine] = round((input_user_avg_rating + (rating - user_avg_rating)) * 2) / 2
    return {key: val for key, val in wine_prediction.items() if val >= threshold}

In [None]:
def get_mse_for_predictions(predictions: dict, test_reviews: pd.DataFrame, input_user: int):
    """Mean squared error of one user's predictions against their held-out ratings.

    :param predictions: mapping of ``wine_id`` to predicted rating.
    :param test_reviews: held-out reviews with ``user_id``, ``wine_id`` and
        ``rating`` columns.
    :param input_user: id of the user whose held-out ratings are compared.
    :return: MSE over the wines that have both a held-out rating and a prediction.
    """
    predicted = pd.DataFrame(predictions.items(), columns=['wine_id', 'predicted_rating'])
    actual = test_reviews[(test_reviews.user_id == input_user)]
    # Left join keeps every held-out review; rows left with any missing value
    # (e.g. wines without a prediction) are then discarded.
    paired = actual.merge(predicted, how='left', on='wine_id')
    paired = paired.dropna()
    return mean_squared_error(y_true=paired.rating, y_pred=paired.predicted_rating)

In [None]:
# Split the reviews of the active users (those present in ``user_100_revs``)
# 70/30 into a training and a held-out part; the fixed seed keeps the split
# reproducible.
train_rev_df, test_rev_df = train_test_split(
    reviews_df[reviews_df.user_id.isin(user_100_revs.user_id.unique())],
    test_size=0.3,
    random_state=26,
)

# Sparse user-by-wine rating matrices for both parts.
train_rev_matrix, test_rev_matrix = (
    get_sparse_wine_user_matrix(split) for split in (train_rev_df, test_rev_df)
)

In [None]:
# Sanity check: total number of review rows across both splits.
train_rev_df.shape[0] + test_rev_df.shape[0]

In [None]:
def evaluate_collaborative_filtering(df_list: list):
    """Predict held-out ratings for every user present in both splits.

    :param df_list: two-element list ``[test_reviews, train_reviews]``; both
        frames need ``user_id``, ``wine_id`` and ``rating`` columns.
    :return: DataFrame with the columns ``wine_id``, ``rating`` (held-out
        rating) and ``rating_predicted``, one row per predicted wine.
    :raises Exception: any error raised while predicting for a user is
        reported on stdout and then re-raised unchanged.

    NOTE: the similarity matrix is built from the module-level
    ``train_rev_matrix``, not from the training frame passed in ``df_list``.
    """
    df_test = df_list[0]
    df_train = df_list[1]
    input_users = df_train[df_train.user_id.isin(df_test.user_id)].user_id.unique()
    sim_matrix = get_sim_matrix(train_rev_matrix)

    # Per-user results are collected and concatenated once at the end; growing
    # a DataFrame with ``pd.concat`` inside the loop copies all previous rows
    # on every iteration. The leading empty frame fixes the column order.
    result_frames = [pd.DataFrame(data={}.items(), columns=['wine_id', 'rating', 'rating_predicted'])]

    for user in input_users:
        try:
            sim_users = get_top_n_similar_users(n=10, sim_matrix=sim_matrix, input_user=user)
            preds: dict = get_n_predictions(input_user=user, similar_users=sim_users, reviews=df_train,
                                            threshold=3.0, is_evaluation=True)
            # Attach the user's held-out rating to every predicted wine.
            df_temp = pd.DataFrame(preds.items(), columns=['wine_id', 'rating_predicted']).merge(
                df_test.loc[df_test.user_id == user, ['wine_id', 'rating']], on='wine_id', how='left')
            result_frames.append(df_temp)
        except Exception as err:
            print('No wines could be predicted for user: ' + str(user) + ' (' + str(err) + ')')
            # Bare ``raise`` re-raises with the original traceback intact.
            raise
    return pd.concat(result_frames)

In [None]:
def _split_into_chunks(frame: pd.DataFrame, chunk_count: int) -> list:
    """Split *frame* row-wise into *chunk_count* consecutive chunks covering every row."""
    chunk_len = len(frame) // chunk_count
    # The final boundary is the frame length, so the last chunk also receives
    # the rows left over by the integer division.
    bounds = [i * chunk_len for i in range(chunk_count)] + [len(frame)]
    return [frame.iloc[start:stop].copy() for start, stop in zip(bounds, bounds[1:])]


def run_multiprocessing(core_count: int = 9):
    """Evaluate the recommender in parallel worker processes and print the result.

    ``test_rev_df`` and ``train_rev_df`` are each cut into *core_count*
    consecutive row chunks; chunk *i* of both is sent to one call of
    ``cf_multiprocessing.evaluate_collaborative_filtering`` as
    ``[test_chunk, train_chunk]`` (presumably the same argument order as the
    notebook function of that name — verify against that module). The partial
    results are concatenated, printed, and their mean squared error is printed.

    :param core_count: number of chunks / submitted tasks (default 9).
    :raises ValueError: if *core_count* is smaller than 1.
    """
    if core_count < 1:
        raise ValueError('core_count must be at least 1')

    # Each list is built from the split its name refers to, and every chunk
    # (including the remainder rows) is submitted.
    test_frames = _split_into_chunks(test_rev_df, core_count)
    train_frames = _split_into_chunks(train_rev_df, core_count)

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(cf_multiprocessing.evaluate_collaborative_filtering, [test_chunk, train_chunk])
            for test_chunk, train_chunk in zip(test_frames, train_frames)
        ]
        partial_results = [future.result() for future in concurrent.futures.as_completed(futures)]

    # Single concatenation; the leading empty frame fixes the column order.
    df_results = pd.concat(
        [pd.DataFrame(data={}.items(), columns=['wine_id', 'rating', 'rating_predicted'])] + partial_results
    )
    print(df_results)
    print(mean_squared_error(y_true=df_results.rating, y_pred=df_results.rating_predicted))

if __name__ == '__main__':
    run_multiprocessing()

In [None]:
# Single-process evaluation over the complete splits; the list order is
# [held-out reviews, training reviews].
df_result = evaluate_collaborative_filtering(df_list=[test_rev_df, train_rev_df])

In [None]:
# Mean squared error between held-out and predicted ratings of the evaluation run.
mean_squared_error(y_true=df_result['rating'], y_pred=df_result['rating_predicted'])