# Recommendation Engine

### data: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [91]:
# load package
import pandas as pd
import numpy as np

In [92]:
# load data
df_movie = pd.read_csv("data/movies_metadata.csv")
df_ratings = pd.read_csv("data/ratings_small.csv")
pd.set_option('display.max_columns', None)
df_movie['movieId'] = pd.to_numeric(df_movie['id'], errors='coerce')
df = pd.merge(df_ratings, df_movie, on='movieId', how='inner')
df_re = df.iloc[:, 0:3]


  df_movie = pd.read_csv("data/movies_metadata.csv")


## Building the Recommendation Engine

### User based recommendation engine


In [93]:
# build rating matrix
ratings = df_re.groupby(["userId","movieId"])["rating"].mean().unstack().reset_index().set_index('userId')
rating_norm = ratings.subtract(ratings.mean(axis=1), axis = 0)
user_similarity = rating_norm.T.corr(method='pearson')


In [94]:
def UBRE(target_user, product_id):
    # find users with similar ratings
    target_user_similarity = user_similarity.loc[[target_user]].dropna(axis=1, how='all').reset_index()
    target_user_similarity.drop(target_user_similarity.iloc[:, 0:1], inplace=True, axis=1)
    target_user_similarity = target_user_similarity.T
    target_user_similarity.columns = ["similarity to the targeted customer"]
    target_user_similarity.reset_index()
    target_user_similarity.drop(target_user, axis=0, inplace=True)
    # get rating to target product based for similar users
    target_item_ratings = rating_norm.loc[:,product_id].dropna(axis=0, how='all').reset_index()
    target_item_ratings.columns = ["userId","rating to the targeted product"]
    ratings_with_user_sim = target_item_ratings.merge(target_user_similarity, on = "userId", how = "inner")
    top5_users = ratings_with_user_sim.sort_values(by = "similarity to the targeted customer", ascending = False).iloc[0:5,]
    try:
        predicted_rating = round(np.average(top5_users['rating to the targeted product'],
                                        weights=top5_users['similarity to the targeted customer']), 2)
    except:
        return 0
    return predicted_rating



For example, if we are predicting the recommended item for user 1, the recommendation would be:

In [95]:
target_user = 1
top_num_recommend = 5
item_rank = {}
for pid in df_re.movieId.unique():
    item_rank[pid] = UBRE(target_user, pid)

In [96]:
# show recommended movies
res = set()
for i, v in sorted(item_rank.items(), key=lambda item: item[1], reverse=True):
    cur_title = df[df['movieId'] == i].original_title.unique()[0]
    res.add(cur_title)
    if len(res) == top_num_recommend:
        break
print("The top recommendation movies are (showed in ID) are: ")
print(res)

The top recommendation movies are (showed in ID) are: 
{'Mermaids', "À l'aveugle", 'Freedom Writers', 'Дневной дозор', 'Dans ma peau'}


### Item based recommendation engine

In [97]:
# Product similarity matrix using Pearson correlation
item_similarity = rating_norm.corr(method='pearson')


In [98]:
def IBRE(target_user, product_id):
    target_item_similarity_score = item_similarity.loc[[product_id]].dropna(axis=1, how='all').reset_index()
    target_item_similarity_score.drop(target_item_similarity_score.iloc[:,0:1], inplace= True,axis = 1)
    target_item_similarity_score = target_item_similarity_score.T.reset_index()
    target_item_similarity_score.columns = ["ProductId","similarity to the targeted product"]
    target_item_similarity_score.drop(target_item_similarity_score[target_item_similarity_score['ProductId'] == product_id].index, inplace=True)
    target_item_similarity_score.head(10)
    # select items that have been rated by the targeted user
    target_user_rated = rating_norm.loc[[target_user]].dropna(axis=1, how='all').reset_index()
    target_user_rated.drop(target_user_rated.iloc[:, 0:1], inplace=True, axis=1)
    target_userid_rated = target_user_rated.T
    target_userid_rated.reset_index(inplace=True)
    target_userid_rated .columns = ["ProductId","target_user_rating"]
    # target_userid_rated.reset_index(inplace=True)
    target_userid_rated.head(10)
    # join target_user_rated and target_item_similarity_score
    ratings_with_item_sim = target_userid_rated.merge(target_item_similarity_score, on= "ProductId", how = "inner")
    ratings_with_item_sim.head(10)
    # Select top 5 products that are most similar to the target product and have been rated by the target user
    top5_items = ratings_with_item_sim.sort_values(by="similarity to the targeted product", ascending= False).iloc[0:5,:]
    #Calculate the predicted rating using weighted average of similarity
    try:
        predicted_rating = round(np.average(top5_items['target_user_rating'],
                                        weights=top5_items['similarity to the targeted product']), 2)
    except:
        return 0
    return predicted_rating



In [99]:
# build the similarity dictionary
target_user = 1
top_num_recommend = 5
item_rank = {}
for pid in df_re.movieId.unique():
    item_rank[pid] = IBRE(target_user, pid)



In [100]:
# show recommended movies
res = set()
for i, v in sorted(item_rank.items(), key=lambda item: item[1], reverse=True):
    cur_title = df[df['movieId'] == i].original_title.unique()[0]
    res.add(cur_title)
    if len(res) == top_num_recommend:
        break
print("The top recommendation movies are (showed in ID) are: ")
print(res)

The top recommendation movies are (showed in ID) are: 
{'Турецкий гамбит', 'Enigma', 'Die Geschichte vom weinenden Kamel', 'Utvandrarna', 'The Sentinel'}
