In [None]:
%load_ext autoreload
%autoreload 2

from Recommender_Lib import *

import pickle
import random
import json

import pandas as pd
import numpy as np

In [2]:
'''
Read in the raw data so we have access to movie titles and genres.
Pass short_circuit_for_movie_titles=True so that only the movie titles are returned, to save space.
'''
# read_raw_data returns three values; only the first (the movies frame) is kept here and
# the other two are discarded.
movies_df, _, _ = read_raw_data(short_circuit_for_movie_titles=True)

In [3]:
'''
Define some users and the movies they like.
The following lists assume an index-to-index correspondence: entry i of every list
describes the same user.
'''
user_names = ["Michael"]
user_genders = ["M"]
user_ages = [32]
user_occupations = ["programmer"]
user_zipcodes = [87103]
# Get the title of a liked movie for input. If we do not have one from the user, pick one at random.
# random.randrange(n) draws from [0, n). The previous random.randint(0, n) is inclusive of n,
# so it could occasionally hand .iloc a position one past the last row and raise IndexError.
# NOTE(review): no seed is set in the visible cells, so the chosen title (and every
# recommendation derived from it below) changes from run to run.
movies_liked_by_users = [
    [movies_df.iloc[random.randrange(movies_df.shape[0])][MOVIE_LENS_25M_MOVIE_TITLE_COL]],
]

# Map every liked title to the matrix indices it resolves to.
#
# movies_liked_by_users_idx_lists mirrors movies_liked_by_users and looks like:
# [
#     user_1
#     [
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         ...
#     ]
#     user_2
#     [
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         ...
#     ]
# ]
movies_liked_by_users_idx_lists = [
    [movie_title_to_mat_idx(liked_title, movies_df) for liked_title in liked_titles]
    for liked_titles in movies_liked_by_users
]

# A title that maps to no indices is unknown to the data set; fail loudly, naming the
# offending user's list, before any recommendation is attempted.
for liked_titles, per_title_idxs in zip(movies_liked_by_users, movies_liked_by_users_idx_lists):
    if any(len(title_idxs) == 0 for title_idxs in per_title_idxs):
        raise Exception(f"List {liked_titles} has a movie in it that is not known by the DB.")

In [26]:
'''
Get our giant (~14GB) partitioned, disk-cached, cosine similarity matrix.
For cosine similarities, the closer sim(m1, m2) is to 1, the more similar m1 and m2 are, and
the closer it is to -1, the more dissimilar they are.

First, load in the cosine sim matrix metadata, then make a PartitionedCosineSimMatrix object.
'''
# Read the partition specs inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(PATH_TO_COSINE_SIMS_PARTITION_SPECS) as partition_specs_file:
    partition_metadata = json.load(partition_specs_file)

# The numeric specs are passed through int() before use.
num_partitions = int(partition_metadata[SPEC_PARTITIONS])
partition_paths = partition_metadata[SPEC_PARTITION_PATHS]
rows_per_partition = int(partition_metadata[SPEC_ROWS_PER_PARTITION])
extra_rows_in_last_partition = int(partition_metadata[SPEC_EXTRA_ROWS_IN_LAST_PARTITION])

# Check the partition files before building the matrix wrapper on top of them.
verify_cosine_sims_mat_exists(partition_paths)
partitioned_cosine_sim_matrix = PartitionedCosineSimMatrix(
    partition_paths, num_partitions, rows_per_partition, extra_rows_in_last_partition)

In [38]:
# Content-based recommendations: for every index a liked title maps to, list the movies
# whose cosine similarity to it meets the threshold below.
how_similair = 0.95  # similarity threshold handed to get_similar_movie_idxs
total_movies = movies_df.shape[0]
for i, user_movie_list in enumerate(movies_liked_by_users_idx_lists):
    print(f"For user {user_names[i]}:")
    for movie_idxs in user_movie_list:
        for movie_idx in movie_idxs:
            similar_to_i = partitioned_cosine_sim_matrix.get_similar_movie_idxs(
                movie_idx,
                how_similair)
            movie_i = movies_df.iloc[movie_idx][MOVIE_LENS_25M_MOVIE_TITLE_COL]

            sim_movie_titles = []
            if len(similar_to_i) > 0:
                sim_movie_titles = movies_df.iloc[similar_to_i][MOVIE_LENS_25M_MOVIE_TITLE_COL].to_list()
                # Drop the seed movie from its own recommendations. Guard the removal:
                # list.remove raises ValueError when the title is absent, which would
                # abort the whole cell if the lookup does not return the seed itself.
                if movie_i in sim_movie_titles:
                    sim_movie_titles.remove(movie_i)

            # Re-check after the removal so a seed that only matched itself is reported
            # as "no matches" instead of printing an empty recommendation list.
            if sim_movie_titles:
                print(
                    f"\tBased off movie {movie_i}, we recommend the following similar movies.\n\t\t",
                    "\n\t\t ".join(sim_movie_titles))
            else:
                print("\tNo matches found for", movie_i)

For user Michael:
	Based off movie Tammy Tell Me True (1961), we recommend the following similar movies.
		 Camille 2000 (1969)
		 Ill-Fated Love (Doomed Love) (Amor de Perdição) (1979)
		 Bridal Party in Hardanger, The (Brudeferden i Hardanger) (1926)
		 One Night in the Tropics (1940)
		 Black Beauty (1946)
		 Vice Versa (1948)
		 Upstairs and Downstairs (1959)
		 Value for Money (1955)
		 The Virgin and the Gypsy (1970)
		 Love, Passion and Pleasure (1972)
		 Monte Cristo (1922)
		 Woman Buried Alive (1973)
		 The Blue Angel (1959)
		 A Rage To Live (1965)
		 From a Roman Balcony (1960)
		 Il Giovane Normale (1969)
		 Barbara (1997)
		 Manslaughter (1930)
		 The Bookshop (2017)
		 Kelly + Victor (2013)
		 Not Cinderella's Type (2018)


In [4]:
'''
Now, let's recommend some movies based off what movies are liked by those who like what the given users like,
using the ratings matrix. We also need the clusters below, and need the number of users now from the clusters
model, so load it now too.
'''
# Load the clustering model inside a context manager so the file handle is closed.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load a model
# file that was produced by this project and is trusted.
with open(PATH_TO_USER_KMEANS_MODEL, "rb") as kmeans_model_file:
    users_clustering = pickle.load(kmeans_model_file)

# The third argument is the number of labels held by the clustering model
# (presumably one per known user — TODO confirm against RatingsMatrixManager).
ratings_matrix_mgr = RatingsMatrixManager(
    PATH_TO_PROCESSED_RATINGS_DATA,
    movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL],
    len(users_clustering.labels_))

In [36]:
# Collaborative recommendations: for every index a liked title maps to, ask the ratings
# matrix manager for movies liked by the users who like that movie.
number_of_movies_to_recommend = 20
for i, user_movie_list in enumerate(movies_liked_by_users_idx_lists):
    print(f"For user {user_names[i]}:")
    for movie_idxs in user_movie_list:
        for movie_idx in movie_idxs:
            movies_liked_by_those_who_like_what_user_likes = ratings_matrix_mgr.find_movies_liked_by_users_that_like_movie(
                movie_idx,
                number_of_movies_to_recommend)
            movie_i = movies_df.iloc[movie_idx][MOVIE_LENS_25M_MOVIE_TITLE_COL]
            if len(movies_liked_by_those_who_like_what_user_likes) > 0:
                # Plain "\n" here: the previous "\r\n" emitted a stray carriage return that
                # the other recommendation cells do not.
                print(
                    f"\tUsers who like {movie_i} also like the following movies.\n\t\t",
                    "\n\t\t ".join(movies_liked_by_those_who_like_what_user_likes))
            else:
                # Indented with a tab so it lines up under the "For user ..." header like
                # the fallback message of the similar-movies cell.
                print("\tNo one else liked", movie_i)

For user Michael:
	Users who like Tammy Tell Me True (1961) also like the following movies.
		 Jumanji (1995)
		 Father of the Bride Part II (1995)
		 Only You (1994)
		 American President, The (1995)
		 Mrs. Doubtfire (1993)
		 Sense and Sensibility (1995)
		 Rudy (1993)
		 Clueless (1995)
		 Schindler's List (1993)
		 Forrest Gump (1994)
		 Secret Garden, The (1993)
		 Bridges of Madison County, The (1995)
		 Apollo 13 (1995)
		 Sleepless in Seattle (1993)
		 Three Musketeers, The (1993)
		 Walk in the Clouds, A (1995)
		 Much Ado About Nothing (1993)
		 Little Women (1994)
		 Little Princess, A (1995)
		 Legends of the Fall (1994)


In [13]:
'''
Now we can embed info about a user, then use it to find similar users with the kmeans clustering.
Then, look at what those users rated highly, and recommend some of those movies.
First get the user embedding specs.
'''
# Read the specs inside a context manager so the file handle is closed deterministically.
with open(PATH_TO_USER_EMBEDDING_SPECS) as user_embedding_specs_file:
    user_embedding_specs = json.load(user_embedding_specs_file)

# Vocab entries and the two size specs are converted to int before use.
unique_vocab = list(map(int, user_embedding_specs[SPEC_UNIQUE_VOCAB]))
output_dim = int(user_embedding_specs[SPEC_OUTPUT_DIM])
zipcode_len = int(user_embedding_specs[SPEC_ZIPCODE_LEN])
# Passed as-is to encode_genders when the user genders are encoded.
male_female_encoder = user_embedding_specs[SPEC_MALE_FEMALE]

In [14]:
# Encode each user attribute with its project encoder; zip codes are used unencoded.
encoded_user_genders = [encode_genders(gender, male_female_encoder) for gender in user_genders]
encoded_user_ages = [encode_age(age) for age in user_ages]
encoded_user_occupations = [encode_occupation(occupation) for occupation in user_occupations]

# One row per user; the integer column labels 0..3 hold gender, age, occupation and
# zip code, in that order.
encoded_user_df = pd.DataFrame(
    {
        0: encoded_user_genders,
        1: encoded_user_ages,
        2: encoded_user_occupations,
        3: user_zipcodes,
    }
)
encoded_user_df

Unnamed: 0,0,1,2,3
0,21,25,12,87103


In [None]:
'''
Now embed the encoded users the way our clustering expects them to be (or, as close as we can get, at least).
'''
# NOTE(review): this call rebinds unique_vocab, so re-running the cell feeds the previously
# returned vocab back in rather than the one loaded from the embedding specs — confirm
# that this is intended.
unique_vocab, users_embedding = embed_users(
    encoded_user_df,
    zipcode_len,
    output_dim,
    encoded_user_df.shape[1],  # number of columns in the encoded user frame
    unique_vocab)

In [16]:
'''
Classify these users: ask the loaded clustering model for a label for each embedded user.
'''
# user_labels[i] is compared against users_clustering.labels_ further down to find the
# known users that share a label with user i.
user_labels = users_clustering.predict(users_embedding)

In [37]:
'''
Recommend movies for each given user based on the known users that share its cluster label.
'''
number_of_movies_to_recommend = 20
for user_pos, user_name in enumerate(user_names):
    # Positions of the known users whose cluster label equals this user's predicted label.
    same_cluster_mask = users_clustering.labels_ == user_labels[user_pos]
    alike_user_idxs = np.nonzero(same_cluster_mask)[0]

    if not len(alike_user_idxs):
        # An empty cluster is probably an error case.
        print(user_name, "is unlike any user.")
        continue

    # Ask the ratings matrix manager which movies the alike users like the most.
    top_titles = ratings_matrix_mgr.find_movies_liked_by_alike_users(
        alike_user_idxs,
        number_of_movies_to_recommend)
    header = f"Users most like {user_name} like the following movies the most.\n\t\t"
    print(header, "\n\t\t ".join(top_titles))

Users most like Michael like the following movies the most.
		 Raiders of the Lost Ark (1981)
		 Twelve Monkeys (1995)
		 Fistful of Dollars, A (1964)
		 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
		 Good, The Bad and The Ugly, The (1966)
		 Robocop (1987)
		 Star Trek: The Wrath of Khan (1982)
		 Ghostbusters (1984)
		 Mad Max 2 (a.k.a. The Road Warrior) (1981)
		 Men in Black (1997)
		 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
		 As Good As It Gets (1997)
		 Wrong Trousers, The (1993)
		 Willy Wonka and the Chocolate Factory (1971)
		 GoodFellas (1990)
		 Monty Python and the Holy Grail (1974)
		 Boat, The (Das Boot) (1981)
		 M*A*S*H (1970)
		 Independence Day (ID4) (1996)
		 Nikita (La Femme Nikita) (1990)
