The following Collaborative Filtering approach is based on <a href="https://pub.towardsai.net/recommendation-system-in-depth-tutorial-with-python-for-netflix-using-collaborative-filtering-533ff8a0e444">this</a> article.

In [1]:
import torch
import sqlite3
import datetime
import numpy as np
import pandas as pd
import thesis_utilities, collaborative_filtering_textual
from multiprocessing_scripts import recommender_nlp
from scipy import sparse
import cf_multiprocessing
import concurrent.futures
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer, util

### Utilities

In [2]:
def load_from_database(
        db: str,
        table: str,
        columns: str,
):
    """Read selected columns of one table from a SQLite database into a DataFrame.

    The database file is resolved as ``../database/<db>.db`` relative to the
    current working directory.

    Args:
        db: Database file name without directory and without the ``.db`` suffix.
        table: Name of the table to read.
        columns: Column list exactly as it should appear after ``SELECT``
            (e.g. ``'user_id, wine_id'`` or ``'*'``).

    Returns:
        A ``pd.DataFrame`` holding the query result.

    Raises:
        Whatever ``pd.read_sql_query`` raises when the query fails (for
        example when the table does not exist).
    """
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        # NOTE: table and column names are SQL identifiers and cannot be bound
        # as query parameters, so they are interpolated into the statement.
        # Only pass trusted values here.
        return pd.read_sql_query(
            'SELECT ' + columns + ' FROM ' + table, con=connection
        )
    finally:
        # Release the connection even when the query raises.
        connection.close()


def save_to_database(
        db: str,
        table: str,
        df: pd.DataFrame
):
    """Write a DataFrame to a table of a SQLite database, replacing the table.

    The database file is resolved as ``../database/<db>.db`` relative to the
    current working directory. An existing table with the same name is
    replaced (``if_exists='replace'``). The DataFrame index is written as
    well, because ``DataFrame.to_sql`` is called with its default ``index``
    setting.

    Args:
        db: Database file name without directory and without the ``.db`` suffix.
        table: Name of the target table.
        df: Data to persist.

    Raises:
        Whatever ``DataFrame.to_sql`` raises; errors are propagated unchanged.
    """
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        df.to_sql(name=table, con=connection, if_exists='replace')
    finally:
        # Always release the connection, whether or not the write succeeded.
        connection.close()
    # Only reached when the write did not raise.
    print('DataFrame has been saved successfully to: ' + db)

### Insert Heading

In [30]:
def get_mse_for_predictions(predictions: dict, test_reviews: pd.DataFrame, input_user: int):
    """Return the mean squared error of one user's predicted ratings.

    Args:
        predictions: Mapping of ``wine_id`` to the predicted rating.
        test_reviews: Reviews with at least the columns ``user_id``,
            ``wine_id`` and ``rating``.
        input_user: Id of the user whose test reviews are scored.

    Returns:
        The MSE between ``rating`` and ``predicted_rating`` over the user's
        test reviews that survive the ``dropna`` below.
    """
    predicted = pd.DataFrame(predictions.items(), columns=['wine_id', 'predicted_rating'])
    actual = test_reviews[test_reviews['user_id'] == input_user]
    # Left join keeps every test review of the user; dropna() then removes
    # rows without a prediction (and any row with a missing value in another
    # column, since no subset is given).
    scored = actual.merge(predicted, how='left', on='wine_id').dropna()
    return mean_squared_error(y_true=scored['rating'], y_pred=scored['predicted_rating'])

In [31]:
def evaluate_collaborative_filtering(df_list: list):
    """Predict ratings for every user that occurs in both the train and test split.

    Args:
        df_list: Two-element sequence ``[df_test, df_train]`` (test split
            first, train split second).

    Returns:
        A DataFrame with the columns ``wine_id``, ``rating`` and
        ``rating_predicted``; ``rating`` comes from a left join on the test
        split and may therefore be missing for some predicted wines.

    Raises:
        Any exception raised while predicting for a user is printed and then
        re-raised, which aborts the whole evaluation.
    """
    df_test, df_train = df_list[0], df_list[1]
    in_both_splits = df_train.user_id.isin(df_test.user_id)
    input_users = df_train[in_both_splits].user_id.unique()
    # NOTE(review): `train_rev_matrix` is a module-level name that is not
    # defined in the visible part of this notebook - confirm where it is set.
    sim_matrix = get_sim_matrix(train_rev_matrix)
    # NOTE(review): collected but never read or returned in this function.
    unpredictable_users = []
    df = pd.DataFrame(data={}.items(), columns=['wine_id', 'rating', 'rating_predicted'])

    for user in input_users:
        try:
            sim_users = get_top_n_similar_users(n=10, sim_matrix=sim_matrix, input_user=user)
            preds: dict = get_n_predictions(
                input_user=user, similar_users=sim_users, reviews=df_train,
                threshold=3.0, is_evaluation=True,
            )
            predicted = pd.DataFrame(preds.items(), columns=['wine_id', 'rating_predicted'])
            actual = df_test.loc[df_test.user_id == user, ['wine_id', 'rating']]
            # Attach the true rating (where available) to each predicted wine.
            df = pd.concat([df, predicted.merge(actual, on='wine_id', how='left')])
        except Exception as err:
            unpredictable_users.append(user)
            print(f'No wines could be predicted for user: {user} ({err})')
            raise err
    return df

In [32]:
def _split_into_chunks(frame: pd.DataFrame, parts: int) -> list:
    """Split *frame* row-wise into *parts* consecutive copies; the last one takes the remainder."""
    size = len(frame) // parts
    bounds = [i * size for i in range(parts)] + [len(frame)]
    return [frame.iloc[bounds[i]:bounds[i + 1]].copy() for i in range(parts)]


def run_multiprocessing():
    """Evaluate collaborative filtering in parallel and print the overall MSE.

    The module-level frames ``train_rev_df`` and ``test_rev_df`` are each cut
    into ``core_count`` row-wise chunks. Chunk ``i`` of the test split and
    chunk ``i`` of the train split are handed to
    ``cf_multiprocessing.evaluate_collaborative_filtering`` as
    ``[test_chunk, train_chunk]``. The partial results are concatenated,
    printed, and scored with ``mean_squared_error``.
    """
    core_count = 10
    # The test chunks must come from the test split and the train chunks from
    # the train split; the last chunk also takes the rows left over by the
    # integer division so that no review is dropped.
    test_frames = _split_into_chunks(test_rev_df, core_count)
    train_frames = _split_into_chunks(train_rev_df, core_count)
    # NOTE(review): chunks are paired purely by row position, so a user's test
    # rows and train rows only meet when they fall into the same chunk index -
    # confirm that the frames are ordered accordingly.
    df_results = pd.DataFrame(data={}.items(), columns=['wine_id', 'rating', 'rating_predicted'])

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(cf_multiprocessing.evaluate_collaborative_filtering, [test_chunk, train_chunk])
            for test_chunk, train_chunk in zip(test_frames, train_frames)
        ]
        # Gather partial results in completion order and concatenate once.
        partial_results = [future.result() for future in concurrent.futures.as_completed(futures)]

    df_results = pd.concat([df_results] + partial_results)
    print(df_results)
    print(mean_squared_error(y_true=df_results.rating, y_pred=df_results.rating_predicted))


# Worker processes may re-import the main module; the guard keeps them from
# launching the evaluation again.
if __name__ == '__main__':
    run_multiprocessing()

NameError: name 'train_rev_df' is not defined

### Prepare evaluation Datasets

In [None]:
def get_embedder(path: str):
    """Load a SentenceTransformer model and move it to Apple's MPS device when usable.

    Args:
        path: Location passed to ``SentenceTransformer.load``.

    Returns:
        The loaded embedder; it is moved to ``'mps'`` only when the MPS
        backend is available at runtime.
    """
    embedder = SentenceTransformer.load(path)
    # `torch.has_mps` is deprecated and only reports whether PyTorch was built
    # with MPS support; `is_available()` also checks that the device can
    # actually be used on this machine.
    if torch.backends.mps.is_available():
        embedder.to('mps')
    return embedder

In [3]:
def get_n_similar_user(
        input_user_id: int,
        review_pool: pd.DataFrame,  # containing input_user's and other user's reviews
        embedder_path: str,
        n: int
) -> pd.DataFrame:
    """Rank other users by the textual similarity of their review notes.

    Candidates are all users (other than the input user) who reviewed at
    least one wine the input user reviewed. For each candidate, the mean of
    the pairwise cosine similarities between the input user's note embeddings
    and the candidate's note embeddings is used as the similarity score.

    Args:
        input_user_id: Id of the user to find neighbours for.
        review_pool: Reviews with the columns ``user_id``, ``wine_id`` and
            ``note``; must contain the input user's and other users' reviews.
        embedder_path: Model location handed to ``get_embedder``.
        n: Maximum number of users to return.

    Returns:
        A DataFrame with the columns ``user_id`` and ``similarity`` (rounded
        to six decimals), sorted by descending similarity and limited to
        ``n`` rows. It is empty when there are no candidates.
    """
    input_user_reviews = review_pool[review_pool['user_id'] == input_user_id]
    input_user_rated_wines = input_user_reviews['wine_id'].unique()
    candidates = review_pool[
        (review_pool['user_id'] != input_user_id) &
        (review_pool['wine_id'].isin(input_user_rated_wines))
        ]['user_id'].unique()

    if len(candidates) == 0:
        # Nothing to compare against (this also covers an input user without
        # any review): skip loading the model and encoding an empty note list.
        return pd.DataFrame({'user_id': [], 'similarity': []})

    embedder: SentenceTransformer = get_embedder(embedder_path)
    input_user_embedding = embedder.encode(input_user_reviews['note'].tolist(), convert_to_tensor=True)

    user_list = []
    similarity_list = []
    for candidate in candidates:
        candidate_notes: list = review_pool[review_pool['user_id'] == candidate]['note'].tolist()
        candidate_embedding = embedder.encode(candidate_notes, convert_to_tensor=True)
        # Mean over the full (input notes x candidate notes) similarity matrix.
        similarity = util.cos_sim(a=input_user_embedding, b=candidate_embedding).mean()
        user_list.append(candidate)
        similarity_list.append(np.round(float(similarity), decimals=6))

    similar_user = pd.DataFrame(
        {'user_id': user_list, 'similarity': similarity_list}
    )
    similar_user.sort_values(by='similarity', ascending=False, inplace=True)
    return similar_user.head(n=n)

In [4]:
def get_predictions(
        review_pool: pd.DataFrame,
        input_user_id: int,
        similar_user_id: int,
        target_wines: list = None
) -> pd.DataFrame:
    """Predict the input user's ratings from one similar user's ratings.

    Each prediction is the input user's mean rating shifted by how far the
    similar user's rating of that wine deviates from the similar user's own
    mean rating. Both means and the prediction are rounded to one decimal.

    Args:
        review_pool: Reviews with the columns ``user_id``, ``wine_id`` and
            ``rating``.
        input_user_id: User the predictions are made for.
        similar_user_id: User whose ratings drive the predictions.
        target_wines: Wines to predict. When ``None``, every wine reviewed by
            the similar user is used.

    Returns:
        A DataFrame with the columns ``user_id`` (always ``input_user_id``),
        ``wine_id`` and ``prediction``, one row per target wine.

    Raises:
        IndexError: If the similar user has no review for a target wine.
    """
    user_ids = review_pool['user_id']
    is_similar_user = user_ids == similar_user_id

    input_user_avg_rating = np.round(review_pool[user_ids == input_user_id]['rating'].mean(), decimals=1)

    if target_wines is None:
        # NOTE(review): the second condition only has an effect when both ids
        # are equal; it does not exclude wines the input user already rated.
        target_wines = review_pool[is_similar_user & (user_ids != input_user_id)]['wine_id'].tolist()

    similar_user_reviews = review_pool[is_similar_user]
    sim_user_avg_rating = np.round(similar_user_reviews['rating'].mean(), decimals=1)

    wines = list(target_wines)
    predictions = []
    for wine in wines:
        # First matching review wins if the similar user rated a wine twice.
        sim_user_rating = similar_user_reviews[similar_user_reviews['wine_id'] == wine]['rating'].values[0]
        deviation = sim_user_rating - sim_user_avg_rating
        predictions.append(np.round(input_user_avg_rating + deviation, decimals=1))

    return pd.DataFrame({
        'user_id': [input_user_id] * len(wines),
        'wine_id': wines,
        'prediction': predictions,
    })

In [5]:
def evaluate_recommender(
        df_train: pd.DataFrame,
        df_test: pd.DataFrame,
        type: str,
        n_predictions: int,
        model_path: str = '../models/zero-shot',
        input_user_list=None,
        is_evaluation: bool = None
):
    """Evaluate a recommender on held-out reviews and return its MSE.

    For every input user the most similar users are determined on the
    training reviews; each similar user's test ratings are then used to
    predict the input user's ratings for wines both of them have in the test
    split. The predictions are compared with the input user's true test
    ratings.

    Args:
        df_train: Training reviews (``user_id``, ``wine_id``, ``note``,
            ``rating``).
        df_test: Test reviews with the same columns.
        type: Recommender type, ``'nlp'`` or ``'rating'``. Only ``'nlp'`` is
            implemented.
        n_predictions: Number of similar users requested per input user.
        model_path: Location of the sentence embedding model.
        input_user_list: Users to evaluate. Defaults to every distinct user
            in ``df_train``.
        is_evaluation: When truthy, the candidate pool is additionally
            restricted to users that have test reviews for wines the input
            user has in the test split.

    Returns:
        The mean squared error between predicted and true ratings.

    Raises:
        ValueError: If ``type`` is not a known recommender type, or if no
            prediction could be produced.
        NotImplementedError: If ``type`` is ``'rating'``.
    """
    if type == 'rating':
        # This branch was an empty placeholder that always failed later with
        # an unrelated KeyError; fail explicitly instead.
        raise NotImplementedError('The rating based recommender is not implemented yet.')
    if type != 'nlp':
        raise ValueError('Please set a valid recommender type (nlp/rating)!')

    if input_user_list is None:
        # Evaluate each user once, not once per training review.
        input_user_list = df_train['user_id'].unique()

    print('Selected model path: ', model_path)
    # Loop-invariant: predictions read ratings from both splits.
    all_reviews = pd.concat([df_train, df_test])
    prediction_frames = []

    for input_user_id in input_user_list:
        input_user_rated_wines: list = df_train[df_train['user_id'] == input_user_id]['wine_id'].tolist()
        input_user_rated_wines_test: list = df_test[df_test['user_id'] == input_user_id]['wine_id'].tolist()

        # Only reviews of wines the input user rated in training are relevant
        # for finding similar users.
        review_pool: pd.DataFrame = df_train[df_train['wine_id'].isin(input_user_rated_wines)]
        if is_evaluation:
            # Keep only users that can be evaluated against the input user,
            # i.e. that reviewed one of the input user's test wines.
            evaluation_ready_user = df_test[
                df_test['wine_id'].isin(input_user_rated_wines_test)
            ]['user_id'].unique().tolist()
            review_pool = review_pool[review_pool['user_id'].isin(evaluation_ready_user)]

        similar_user: pd.DataFrame = collaborative_filtering_textual.get_n_similar_user(
            input_user_id=input_user_id, review_pool=review_pool,
            embedder_path=model_path, n=n_predictions
        )

        input_user_test_wines = set(input_user_rated_wines_test)
        for similar_user_id in similar_user['user_id']:
            # Wines both users have in the test split can be predicted and scored.
            target_wines = list(
                set(df_test[df_test['user_id'] == similar_user_id]['wine_id']) & input_user_test_wines
            )
            prediction_frames.append(get_predictions(
                review_pool=all_reviews, input_user_id=input_user_id,
                similar_user_id=similar_user_id, target_wines=target_wines,
            ))

    df_predictions = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()
    if df_predictions.empty:
        raise ValueError('No predictions could be made for the given input users.')

    df_results = df_predictions.merge(df_test[['user_id', 'wine_id', 'rating']], on=['user_id', 'wine_id'])
    mse: float = mean_squared_error(y_pred=df_results['prediction'], y_true=df_results['rating'])
    print(mse)
    return mse

In [6]:
# Load the train and test review splits from their SQLite databases
# (presumably the English-language reviews, judging by the `_en` names).
# Only the columns read by the recommender functions are selected.
df_en_train = load_from_database(db='review_en_train', table='review_en_train',
                                 columns='user_id, wine_id, note, rating')
df_en_test = load_from_database(db='review_en_test', table='review_en_test',
                                columns='user_id, wine_id, note, rating')

In [12]:
# Time one evaluation run of the NLP recommender for a single input user,
# using the model stored under '../models/simcse_en' and up to 20 similar users.
start = datetime.datetime.now()
evaluate_recommender(
    df_train=df_en_train, df_test=df_en_test, type='nlp', n_predictions=20, model_path='../models/simcse_en',
    input_user_list=[27493713], is_evaluation=True)
# Wall-clock duration of the evaluation.
print(datetime.datetime.now() - start)

Selected model path:  ../models/simcse_en
0.5405
0:00:01.358050
