
The following Collaborative Filtering approach is based on <a href="https://pub.towardsai.net/recommendation-system-in-depth-tutorial-with-python-for-netflix-using-collaborative-filtering-533ff8a0e444">this</a> article.

In [1]:
import recommender_system
from sklearn.metrics import mean_squared_error
from torch.multiprocessing import Pool
import pandas as pd
import datetime
import sqlite3

### Utilities

In [2]:
def load_from_database(
        db: str,
        table: str,
        columns: str,
) -> pd.DataFrame:
    """Read selected columns of one table from a SQLite file into a DataFrame.

    Args:
        db: Database file name without extension; the file is opened at
            ``../database/<db>.db``.
        table: Name of the table to query.
        columns: Column list placed verbatim after ``SELECT``
            (e.g. ``'user_id, rating'``).

    Returns:
        The result of ``SELECT <columns> FROM <table>``.

    Raises:
        Whatever ``pd.read_sql_query`` raises for an invalid query; the
        connection is closed before the error propagates.
    """
    # NOTE: `table` and `columns` are spliced into the SQL text (identifiers
    # cannot be bound as parameters), so only pass trusted values.
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        return pd.read_sql_query(
            'SELECT ' + columns + ' FROM ' + table, con=connection
        )
    finally:
        # Release the file handle even when the query fails.
        connection.close()


def save_to_database(
        db: str,
        table: str,
        df: pd.DataFrame
) -> None:
    """Write a DataFrame to a table of a SQLite file, replacing the table.

    Args:
        db: Database file name without extension; the file is opened at
            ``../database/<db>.db``.
        table: Name of the target table; an existing table is replaced.
        df: Data to store.

    Raises:
        Whatever ``DataFrame.to_sql`` raises; the connection is closed before
        the error propagates and no success message is printed.
    """
    connection = sqlite3.connect('../database/' + db + '.db')
    try:
        df.to_sql(name=table, con=connection, if_exists='replace')
    finally:
        # The connection was previously left open; always release it.
        connection.close()
    print('DataFrame has been saved successfully to: ' + db)

### Load Data

In [3]:
# Load the English train and test review splits; in both cases the table name
# matches the database file name.
df_en_train = load_from_database(
    db='review_en_sc_train',
    table='review_en_sc_train',
    columns='user_id, wine_id, note, rating, likes_count',
)
df_en_test = load_from_database(
    db='review_en_sc_test',
    table='review_en_sc_test',
    columns='user_id, wine_id, note, rating, likes_count',
)

In [4]:
# Load the review table of the sentiment database with the same column
# selection used for the train/test splits.
df = load_from_database(
    db='review_en_sentiment',
    table='review_en_sentiment',
    columns='user_id, wine_id, note, rating, likes_count',
)

In [46]:
# Inspect how much data surrounds a single user.
user = 56640092
# Wines this user rated in the train and in the test split.
rated_train = df_en_train.loc[df_en_train['user_id'] == user, 'wine_id']
rated_test = df_en_test.loc[df_en_test['user_id'] == user, 'wine_id']
# All train reviews of wines the user rated in the train split.
df_temp = df_en_train[df_en_train['wine_id'].isin(rated_train)]
print(len(df_temp))
# How many of those reviews come from users who, in the test split, reviewed
# a wine the selected user rated there.
print(int(
    df_temp['user_id'].isin(
        df_en_test.loc[df_en_test['wine_id'].isin(rated_test), 'user_id']
    ).sum()
))

6096
79


In [50]:
"""
        df_train: pd.DataFrame = args[0]
        df_test: pd.DataFrame = args[1]
        input_user_list = args[2]
        n_user: int = args[3]
        type_name: str = args[4]
        is_evaluation: bool = args[5]
        truncate: bool = args[6]
    """
# Time a single-user evaluation run of the recommender.
start = datetime.datetime.now()
predictions = recommender_system.evaluate_recommender(
    [
        df_en_train,  # args[0]: df_train
        df_en_test,  # args[1]: df_test
        [56640092],  # args[2]: input_user_list
        5,  # args[3]: n_user
        'nlp',  # args[4]: type_name
        True,  # args[5]: is_evaluation
        False,  # args[6]: truncate
        # args[7]: presumably the model path; it is not listed in the
        # argument note of this cell, so confirm against recommender_system.
        '../models/zero-shot'
    ]
)
mse = mean_squared_error(y_pred=predictions['prediction'], y_true=predictions['rating'])
# RMSE is derived from the MSE instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and no longer accept.
rmse = mse ** 0.5
print(str(mse), '\n', str(rmse))
print(datetime.datetime.now() - start)

0.176 
 0.41952353926806063
0:00:01.019047


### Evaluate NLP Recommender

In [5]:
# Credits split method: https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split(a, n):
    """Cut the sequence ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices are one element longer than the rest.
    Returns a list holding the ``n`` slices in order.
    """
    base, extra = divmod(len(a), n)
    chunks = []
    begin = 0
    for idx in range(n):
        # Early chunks absorb the remainder, one extra element each.
        end = begin + base + (1 if idx < extra else 0)
        chunks.append(a[begin:end])
        begin = end
    return chunks

def start_nlp(core_count: int = 4, model: str = 'simcse_en_sc'):
    """Evaluate the NLP recommender in parallel and persist the results.

    Users present in both the train and the test split are divided into
    ``core_count`` chunks; each chunk is evaluated by one worker process via
    ``recommender_system.evaluate_recommender``. MSE and RMSE over all
    predictions are appended to the ``rs_evaluation`` table of
    ``../database/rs_evaluation.db`` and the predictions replace the
    ``nlp_<model>`` table of ``../database/predictions.db``.

    Args:
        core_count: Number of worker processes and user chunks.
        model: Model directory name below ``../models/``.
    """
    shared_users = list(set(df_en_test['user_id']) & set(df_en_train['user_id']))
    df_test = df_en_test[df_en_test['user_id'].isin(shared_users)]
    df_train = df_en_train[df_en_train['user_id'].isin(shared_users)]
    type_name = 'nlp'
    model_path = '../models/' + model

    # One positional argument list per worker:
    #   args[0] df_train, args[1] df_test, args[2] input_user_list,
    #   args[3] n_user, args[4] type_name, args[5] is_evaluation,
    #   args[6] truncate, args[7] model path (presumably; confirm against
    #   recommender_system).
    data = [
        [df_train, df_test, user_chunk, 5, type_name, True, False, model_path]
        for user_chunk in split(shared_users, core_count)
    ]

    # The context manager tears the pool down even if a worker raises;
    # map() has already collected every result when the block exits.
    with Pool(processes=core_count) as multi_pool:
        start = datetime.datetime.now()
        predictions = multi_pool.map(recommender_system.evaluate_recommender, data)
    df_results = pd.concat(predictions)
    duration = datetime.datetime.now() - start

    mse: float = mean_squared_error(y_pred=df_results['prediction'], y_true=df_results['rating'])
    # Derived from the MSE rather than via `squared=False`, which newer
    # scikit-learn releases deprecate and no longer accept.
    rmse: float = mse ** 0.5

    df_metrics = pd.DataFrame(
        {
            'date': [datetime.datetime.now()],
            # Holds a timedelta; the column name suggests it ends up as
            # nanoseconds in SQLite.
            'duration (in ns)': [duration],
            'mse': [mse],
            'rmse': [rmse],
            'type': [type_name + ' (' + model + ')'],
        }
    )
    con_rs_evaluation = sqlite3.connect('../database/rs_evaluation.db')
    try:
        df_metrics.to_sql(name='rs_evaluation', con=con_rs_evaluation, if_exists='append')
    finally:
        con_rs_evaluation.close()

    con_predictions = sqlite3.connect('../database/predictions.db')
    try:
        # Table name follows the evaluated model ('nlp_simcse_en_sc' by default).
        df_results.to_sql(name=type_name + '_' + model, con=con_predictions, if_exists='replace')
    finally:
        con_predictions.close()

    print(len(df_results))
    print(duration)


# Entry-point guard: run the NLP evaluation only when this code executes as
# the main module, so a process that merely imports the module does not start
# a new evaluation.
if __name__ == '__main__':
    start_nlp()

  df.to_sql(name='rs_evaluation', con=con_rs_evaluation, if_exists='append')


457044
11:35:50.588686


### Evaluate Reference Model

In [5]:
# Credits split method: https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def split(a, n):
    """Cut the sequence ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices are one element longer than the rest.
    Returns a list holding the ``n`` slices in order.
    """
    base, extra = divmod(len(a), n)
    chunks = []
    begin = 0
    for idx in range(n):
        # Early chunks absorb the remainder, one extra element each.
        end = begin + base + (1 if idx < extra else 0)
        chunks.append(a[begin:end])
        begin = end
    return chunks


def start_numeric(core_count: int = 3, n_predictions: int = 20):
    """Evaluate the numeric reference recommender in parallel and log metrics.

    Users present in both the train and the test split are divided into
    ``core_count`` chunks, each evaluated in its own process via
    ``collaborative_filtering_numeric.evaluate_recommender``. MSE and RMSE
    over all predictions are appended to the ``rs_evaluation`` table of
    ``../database/rs_evaluation.db``.

    Args:
        core_count: Number of user chunks and worker processes.
        n_predictions: Value passed as the third positional argument
            (``n_predictions`` in the original argument note).
    """
    # Imported locally because the notebook's import cell does not provide it.
    import concurrent.futures

    shared_users = list(set(df_en_test['user_id']) & set(df_en_train['user_id']))
    df_test = df_en_test[df_en_test['user_id'].isin(shared_users)]
    df_train = df_en_train[df_en_train['user_id'].isin(shared_users)]
    input_user_id_frames = split(shared_users, core_count)

    type_var = 'numeric'

    # NOTE(review): the original argument note listed args[0] as df_train and
    # args[1] as df_test, yet df_test is passed first. The order is kept as
    # it was; confirm it against collaborative_filtering_numeric.
    # NOTE(review): collaborative_filtering_numeric is not imported in the
    # visible import cell and must be available in the session.
    start = datetime.datetime.now()
    with concurrent.futures.ProcessPoolExecutor(max_workers=core_count) as executor:
        futures = [
            executor.submit(collaborative_filtering_numeric.evaluate_recommender, [
                df_test,  # 0
                df_train,  # 1
                n_predictions,  # 2
                user_chunk,  # 3
            ]) for user_chunk in input_user_id_frames]
        # Gather in completion order; result() re-raises worker exceptions.
        partial_results = [
            future.result() for future in concurrent.futures.as_completed(futures)
        ]
    # Concatenate once instead of growing a DataFrame inside the loop.
    df_results = pd.concat(partial_results)

    mse: float = mean_squared_error(y_pred=df_results['prediction'], y_true=df_results['rating'])
    # Derived from the MSE rather than via `squared=False`, which newer
    # scikit-learn releases deprecate and no longer accept.
    rmse: float = mse ** 0.5
    duration = datetime.datetime.now() - start

    df_metrics = pd.DataFrame(
        {
            'date': [datetime.datetime.now()],
            'duration (in ns)': [duration],
            'mse': [mse],
            'rmse': [rmse],
            'type': [type_var],
        }
    )
    con_rs_evaluation = sqlite3.connect('../database/rs_evaluation.db')
    try:
        df_metrics.to_sql(name='rs_evaluation', con=con_rs_evaluation, if_exists='append')
    finally:
        # The connection was previously never closed.
        con_rs_evaluation.close()
    print(duration)


# Entry-point guard: run the reference-model evaluation only when this code
# executes as the main module, so a process that merely imports the module
# does not start a new evaluation.
if __name__ == '__main__':
    start_numeric()

0:00:07.966758
