In [16]:

def recommend(model, user_id, business_ids, top_n=10):
    """
    Recommend the top N businesses for a given user based on predicted ratings.
    
    Parameters:
    model: An object exposing predict(user_id, business_id) that returns a comparable score
           (documented by the notebook as a rating between 1 and 5).
    user_id: The ID of the user for whom recommendations are being generated.
    business_ids: An iterable of hashable business IDs to consider for recommendation.
                  Duplicate IDs are scored only once; the first occurrence fixes their position.
    top_n: The maximum number of recommendations to return (default is 10).
           None returns every candidate; zero or a negative value returns an empty list.
    
    Returns:
    A list of (business_id, predicted_rating) tuples in descending order of predicted rating.
    Candidates with equal predictions keep their input order (the sort is stable).
    """
    # A negative top_n would otherwise be interpreted by the slice below as
    # "drop items from the end", which is never what a caller asking for a
    # top-N list wants. Bail out before spending any time on predictions.
    if top_n is not None and top_n <= 0:
        return []
    
    # dict.fromkeys de-duplicates while preserving first-seen order, so the same
    # business cannot be predicted twice or occupy several slots in the result.
    candidate_ids = dict.fromkeys(business_ids)
    
    predictions = [(business_id, model.predict(user_id, business_id)) for business_id in candidate_ids]
    
    # Sort businesses by predicted rating in descending order and return the top_n
    recommendations = sorted(predictions, key=lambda pair: pair[1], reverse=True)[:top_n]
    
    return recommendations


def evaluate(model, test_df, threshold=3, top_n=10):
    """
    Evaluate the recommendation model using precision and recall.
    
    For every user in test_df, the businesses that user has rows for are ranked with
    recommend() and the top_n of them are compared with the businesses the user actually
    rated at or above the threshold. Counts are pooled over all users before dividing
    (micro-averaged), so users with more rows weigh more.
    
    Caveat: the candidate pool for a user is only that user's own rows in test_df, and the
    threshold is applied to the actual ratings only, not to the predicted ones. A user with
    at most top_n distinct businesses therefore gets all of them recommended, which makes
    recall trivially 1.0 for that user.
    
    Parameters:
    model: A model that predicts ratings for (user_id, business_id) pairs.
    test_df: A DataFrame containing columns ['user_id', 'business_id', 'rating'].
    threshold: The minimum rating to consider a business as relevant (default is 3).
    top_n: The number of top recommendations to consider per user (default is 10).
    
    Returns:
    A tuple (recall, precision) of floats for the model's recommendations. Note the order:
    recall comes first. Either value is 0.0 when its denominator is zero.
    """
    total_relevant = 0
    total_recommended = 0
    total_matched = 0
    
    # Split the frame by user in a single pass instead of re-filtering the whole
    # frame once per user. sort=False keeps users in order of first appearance.
    for user_id, user_data in test_df.groupby('user_id', sort=False):
        actual_relevant = set(user_data.loc[user_data['rating'] >= threshold, 'business_id'])
        
        # A user may have several rows for the same business; offer each business
        # to the ranker once so duplicates cannot take several top_n slots.
        candidate_ids = user_data['business_id'].drop_duplicates().tolist()
        recommended = {business_id for business_id, _ in recommend(model, user_id, candidate_ids, top_n)}
        
        total_relevant += len(actual_relevant)
        total_recommended += len(recommended)
        total_matched += len(actual_relevant & recommended)
    
    # Guard against empty input / no relevant items; always hand back floats.
    recall = total_matched / total_relevant if total_relevant > 0 else 0.0
    precision = total_matched / total_recommended if total_recommended > 0 else 0.0
    
    return recall, precision

In [17]:
import sys
import os

# Directory the notebook kernel is currently running in.
BASE_DIR = os.getcwd()

# Register that directory on the import path exactly once, so that re-running
# this cell does not keep appending duplicate entries.
if sys.path.count(BASE_DIR) == 0:
    sys.path.append(BASE_DIR)

In [18]:
import pickle
import pandas as pd
from model import Model

# Load the CSV
df_business = pd.read_csv('business_transformed.csv')

# Take the first 30 business_id values as the candidate pool for the demo
business_ids = df_business['business_id'].head(30).tolist()

# Load the best model. The `with` block guarantees the file handle is closed
# even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files produced by a trusted training run.
with open('../training/saved_models/model.pkl', 'rb') as model_file:
    pretrained_model = pickle.load(model_file)

# Wrap the unpickled object in the project's Model class, which is what
# recommend() calls .predict(user_id, business_id) on.
model = Model(pretrained_model)

# Rank the 30 candidate businesses for one example user and keep the best 10
rec_list = recommend(model, "zJ4MM5h-0Mb4VvO-WEskJg", business_ids, top_n=10)

rec_list

[('WYYdQDjx-DsCanlP0DpImQ', 4.121147155761719),
 ('BxR5hjyBnJZrm_nnaCtGQw', 4.049802303314209),
 ('NQSnr4RPUScss607oxOaqw', 4.033130645751953),
 ('e8pMCk4bE0U4qk7z59Lxkg', 4.020570755004883),
 ('n8ecak12IF_jhnPNs37AZA', 3.983497142791748),
 ('kq5Ghhh14r-eCxlVmlyd8w', 3.6866531372070312),
 ('VCkSUsdL5P0p16DAPxVROA', 3.666249990463257),
 ('MTSW4McQd7CbVtyjqoe9mw', 3.6572084426879883),
 ('FTFtoylMzkrzqNVSl-bkKw', 3.652892589569092),
 ('JomDoQafikclSVVCPkWjnA', 3.6328701972961426)]

In [19]:
# Read the full review table from disk
df_reviews = pd.read_csv('../reviews_with_topics.csv')

# Build a small evaluation set in one chain: keep the three columns evaluate()
# needs, draw 30 rows at random (seeded so the draw is repeatable), and rename
# the star column to the 'rating' name that evaluate() expects.
test_df = (
    df_reviews[['user_id', 'business_id', 'stars_review']]
    .sample(30, random_state=42)
    .rename(columns={'stars_review': 'rating'})
)

test_df

Unnamed: 0,user_id,business_id,rating
50112,SGvsin5Fw5tk3F0mx_QEDA,by2wRASwOKpCZU9KtZ3MHA,3.0
69508,Cf8Jw9yX8RrG-l4PQoMkuw,2CDI713ATuxHfnB5b-sBdw,5.0
50796,b_6RAjLot-sGZit_fwOAzg,LEyWxCngS9m5hWc9coJD4g,4.0
82332,mzL0zHFGyd9866OUljq49g,mBgaPljP3OYkl_vGKTyFNw,4.0
61736,NLpjUuO5uN-qKKmait8Nzg,EtKSTHV5Qx_Q7Aur9o4kQQ,4.0
40808,WfePuaS_iXGNGU6bfDe3JQ,SGi8AJhR8iS-LsQwcMOAVA,5.0
14247,7NyerV7a3CsvYC20iSfLHg,zfn7V7FVH5_J5A9dInfbnA,2.0
6888,61yZ26RhF0g_bdumzD0yWg,Tk9KD_DDpcMeceID_VrutQ,5.0
30067,IH0ToaZ8hJXO2pVieN7dpQ,jlOPXMrsxv9gq0y3BqZgmQ,4.0
75497,KW-X4cQ2yBZYVUjFaAY04A,99e7bysta1myyrQogFEWUQ,4.0


In [20]:
# evaluate() returns its metrics as a (recall, precision) tuple, in that order
results = evaluate(model=model, test_df=test_df, threshold=3, top_n=10)
results

(1.0, 0.9666666666666667)