In [1]:
import pandas as pd
import numpy as np
from operator import itemgetter

##### For this code to work, a semicolon-delimited file called ```user_ratings.csv``` must exist containing, in this order, the columns ```userId, movieId, enjoyRating, meaningRating```, followed by one column per character strength, named as in the movie dataset ```final_movie_dataset.csv```.

In [2]:
# Movie table: read with pandas' default (comma) separator.
movie_df = pd.read_csv(
    './final_movie_dataset.csv',
    low_memory=False,
)

# Per-user ratings table: this file is semicolon-separated.
user_ratings_df = pd.read_csv(
    './user_ratings.csv',
    sep=';',
    low_memory=False,
)

In [3]:
# Boolean mask: True where the movie's average meaning rating is strictly
# positive. The notebook uses that as the marker for "has character-strength
# data".
has_ch_strengths = movie_df['averageMeaningRating'].gt(0.0)

# Subset of the movie table restricted to the rows selected by the mask.
ch_movie_df = movie_df.loc[has_ch_strengths]

##### The dataframe ```ch_movie_df``` contains only the movies that have character-strength data (detected here as ```averageMeaningRating > 0```). Recommendations can only be generated for those movies.

In [4]:
def cosine_similarity(a, b):
    '''
    Cosine of the angle between two 1-D vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero
    norm the angle is undefined; 0.0 is returned in that case instead of the
    NaN (plus RuntimeWarning) that a 0/0 division would produce, so the
    result can always be compared and sorted.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (anything ``np.dot`` accepts).

    Returns
    -------
    float
        The cosine similarity, or 0.0 when a norm is zero.
    '''
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)

    # A zero-length vector has no direction: treat it as "not similar".
    if norm_product == 0:
        return 0.0

    return np.dot(a, b) / norm_product

In [5]:
def get_user_score(user_ratings_df, user_id):
    '''
    Build a user's character-strength profile from the movies they rated.

    Only the user's rows with ``meaningRating >= 3.0`` contribute. Each such
    row's character-strength values are multiplied by that row's meaning
    rating, the weighted rows are summed per strength, and the result is
    scaled so that its entries sum to 1.

    Parameters
    ----------
    user_ratings_df : pandas.DataFrame
        Ratings table with ``userId`` and ``meaningRating`` columns. Every
        column from position 4 onwards is treated as a character strength.
    user_id
        Value looked up in the ``userId`` column.

    Returns
    -------
    pandas.Series
        One entry per character-strength column, summing to 1.

    Raises
    ------
    ValueError
        If ``user_id`` does not appear in ``userId``, or if the user has no
        usable data (no rating >= 3.0, or the weighted strengths sum to
        zero), in which case a profile cannot be normalised.
    '''
    # Position of the first character-strength column; the columns before it
    # are userId, movieId, enjoyRating and meaningRating.
    FIRST_STRENGTH_COL = 4
    MIN_MEANING_RATING = 3.0

    is_user = user_ratings_df['userId'] == user_id
    if not is_user.any():
        # ValueError is a subclass of Exception, so callers that caught the
        # generic Exception raised here before keep working.
        raise ValueError(f"The user with id {user_id} is not in the database.")

    is_meaningful = user_ratings_df['meaningRating'] >= MIN_MEANING_RATING
    relevant_df = user_ratings_df[is_user & is_meaningful]

    # Read the rating by name, like the filter above does, rather than by
    # column position.
    meaning_scores = relevant_df['meaningRating'].to_numpy()

    # Scale every row of character strengths by that row's meaning rating.
    ch_strs = relevant_df.iloc[:, FIRST_STRENGTH_COL:].mul(meaning_scores, axis=0)

    # Sum the weighted rows per strength, then normalise to a unit total.
    strength_totals = ch_strs.sum()
    grand_total = strength_totals.sum()
    if grand_total == 0:
        # Dividing by zero here would silently return an all-NaN profile.
        raise ValueError(
            f"The user with id {user_id} has no ratings with meaningRating >= "
            f"{MIN_MEANING_RATING} and non-zero character strengths."
        )

    return strength_totals / grand_total

In [6]:
def get_movie_score(movie_df, movie_id, max_meaningful_votes, general_score):
    '''
    Score a single movie from its character strengths, shrunk towards a prior.

    The movie's character-strength values are normalised to sum to 1 and then
    blended with ``general_score``:

        weight * movie_score + (1 - weight) * general_score

    where ``weight = numMeaningVotes / max_meaningful_votes``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Movie table with ``movieId``, ``averageMeaningRating`` and
        ``numMeaningVotes`` columns. Every column from position 10 onwards
        is treated as a character strength.
    movie_id
        Value looked up in the ``movieId`` column.
    max_meaningful_votes : number
        Denominator of the weight.
    general_score : pandas.Series or array-like
        A-priori score, one entry per character strength.

    Returns
    -------
    pandas.Series
        Blended score, one entry per character-strength column. When the
        movie's strengths sum to zero, or ``max_meaningful_votes`` is zero,
        the weight is 0 and the result equals ``general_score``.

    Raises
    ------
    IndexError
        If no row of ``movie_df`` has the given ``movie_id``.
    '''
    # Position of the first character-strength column in movie_df.
    FIRST_STRENGTH_COL = 10

    relevant_df = movie_df[movie_df['movieId'] == movie_id]
    if relevant_df.empty:
        # Same exception type the bare ``.values[0]`` lookup used to raise,
        # but with a message that names the missing id.
        raise IndexError(f"No movie with id {movie_id} in the given dataframe.")

    meaningful_score = relevant_df['averageMeaningRating'].values[0]
    num_votes = relevant_df['numMeaningVotes'].values[0]

    # NOTE(review): meaningful_score is a single scalar, so it cancels in the
    # normalisation below whenever it is finite and non-zero. It is kept so
    # results stay numerically identical to the previous implementation.
    strengths = (relevant_df.iloc[:, FIRST_STRENGTH_COL:] * meaningful_score).sum()
    total = strengths.sum()

    has_profile = total != 0
    # Avoid the 0/0 division that would turn the whole score into NaN.
    movie_score = strengths / total if has_profile else strengths

    # Without strength evidence for this movie, or without any votes to scale
    # the weight by, rely entirely on the prior.
    if has_profile and max_meaningful_votes != 0:
        weight = num_votes / max_meaningful_votes
    else:
        weight = 0.0

    return weight * movie_score + (1 - weight) * general_score

In [7]:
def get_user_recommendations(user_id, ch_movie_df, user_ratings_df, top_n=12):
    '''
    Rank movies for a user by cosine similarity between the user's
    character-strength profile and every movie's score.

    Parameters
    ----------
    user_id
        Id passed to ``get_user_score``.
    ch_movie_df : pandas.DataFrame
        Movie table with ``movieId``, ``averageMeaningRating`` and
        ``numMeaningVotes`` columns. Every column from position 10 onwards
        is treated as a character strength.
    user_ratings_df : pandas.DataFrame
        Ratings table passed to ``get_user_score``.
    top_n : int or None, optional
        Maximum number of recommendations to return (default 12, the value
        that used to be hard-coded). ``None`` returns the full ranking.

    Returns
    -------
    (list, list)
        Movie ids and their similarity scores, in matching order, most
        similar first. Movies with equal similarity keep the order in which
        they appear in ``ch_movie_df``; NaN similarities are ranked last.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    '''
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative or None, got {top_n}.")

    # Position of the first character-strength column in ch_movie_df.
    FIRST_STRENGTH_COL = 10

    user_score = get_user_score(user_ratings_df, user_id)

    # Character strengths only, each row scaled by the movie's meaning rating.
    meaning_ratings = np.array(ch_movie_df['averageMeaningRating'])
    all_ch_strs = ch_movie_df.iloc[:, FIRST_STRENGTH_COL:] * meaning_ratings[:, np.newaxis]
    # Normalise every row (every movie) so that it sums to 1.
    all_ch_strs = all_ch_strs.div(all_ch_strs.sum(axis=1), axis=0)

    # The prior ("general") score is the mean of the normalised movie rows.
    general_score = all_ch_strs.mean(axis=0)

    max_meaningful_votes = ch_movie_df['numMeaningVotes'].max()

    # NOTE(review): every movie in ch_movie_df is scored, including any the
    # user has already rated -- confirm whether those should be excluded.
    movie_cosine_sim = {}
    for movie in ch_movie_df['movieId']:
        movie_score = get_movie_score(ch_movie_df, movie, max_meaningful_votes, general_score)
        movie_cosine_sim[movie] = cosine_similarity(user_score, movie_score)

    def _rank_key(item):
        # NaN compares False against everything, which leaves sorted() with
        # an undefined order; map it to -inf so such movies sink to the end.
        similarity = item[1]
        return float('-inf') if np.isnan(similarity) else similarity

    # Higher similarity is better. sorted() is stable, also with reverse=True.
    ranked = sorted(movie_cosine_sim.items(), key=_rank_key, reverse=True)
    top = ranked[:top_n]

    return [movie for movie, _ in top], [similarity for _, similarity in top]

In [8]:
# Compute the ranked recommendations for user 0, then print them as a
# tab-separated table: rank, left-aligned title padded to 60 chars, score.
recommendations_id, sim_scores = get_user_recommendations(0, ch_movie_df, user_ratings_df)

header_template = 'Recommendations for user:\n\nRank\t{:<60} {}'
print(header_template.format("Movie title", "Cosine similarity"))

row_template = '{}\t{:<60} {}'
for rank, (recom_id, score) in enumerate(zip(recommendations_id, sim_scores), start=1):
    # Title of the first row whose movieId matches the recommended id.
    matching_titles = ch_movie_df.loc[ch_movie_df['movieId'] == recom_id, 'primaryTitle']
    movie = matching_titles.values[0]
    print(row_template.format(rank, movie, score))

Recommendations for user:

Rank	Movie title                                                  Cosine similarity
1	Taste of Cherry                                              0.6415811282184206
2	The Lord of the Rings: The Fellowship of the Ring            0.6415811282184206
3	The Lord of the Rings: The Return of the King                0.6415811282184206
4	The Lord of the Rings: The Two Towers                        0.6415811282184206
5	Life as a House                                              0.6415811282184206
6	Valentin                                                     0.6415811282184206
7	Crash                                                        0.5832853282858642
8	The Fountain                                                 0.5832853282858642
9	Feast of Love                                                0.5832853282858642
10	Star Wars: Episode IV - A New Hope                           0.5703234321017339
11	The Karate Kid                                               0.57