In [None]:
%load_ext autoreload
%autoreload 2

from Recommender_Lib import *

import pickle
import random
import json

import pandas as pd
import numpy as np

In [2]:
'''
Read in the raw data so we have access to the movie titles.
short_circuit_for_movie_titles=True is passed so that only the movie titles come back (to save space);
the other two values read_raw_data returns are discarded.
'''
movies_df, _, _ = read_raw_data(short_circuit_for_movie_titles=True)

In [13]:
# Spot-check a single row (by position) to confirm the titles loaded as expected.
# NOTE(review): 40000 looks like an arbitrary position; this fails if the frame has fewer rows.
movies_df.iloc[40000]

movieId             158445
title      Frenchie (1950)
Name: 40000, dtype: object

In [15]:
'''
Define some users and movies they like.
The following lists assume an index to index correspondence.
'''
user_names = ["Michael"]
user_genders = ["M"]
user_ages = [32]
user_occupations = ["programmer"]
user_zipcodes = [87103]
movies_liked_by_users = [
    ["Die Hard"],
]

# movies_liked_by_users_idx_lists is a list like:
# [
#     user_1
#     [
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         ...
#     ]
#     user_2
#     [
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         [idxs_mapped_to_by_movie_title_1, idxs_mapped_to_by_movie_title_2, ... ],
#         ...
#     ]
# ]
# A single title can map to several indices (e.g. "Die Hard" matches more than one movie),
# which is why each title contributes its own inner list.
movies_liked_by_users_idx_lists = []
for movies in movies_liked_by_users:
    idxs_i = []
    for title in movies:
        movie_idxs = movie_title_to_mat_idx(title, movies_df)
        # Validate while mapping so the failure names the exact title that has no match,
        # instead of reporting the whole list after everything was mapped.
        if len(movie_idxs) == 0:
            raise ValueError(
                f"Movie title {title!r} in list {movies} is not known by the DB.")
        idxs_i.append(movie_idxs)

    movies_liked_by_users_idx_lists.append(idxs_i)

In [16]:
'''
Get our giant (~14GB) partitioned, disk-cached, cosine similarity matrix.
For cosine similarities, the closer sim(m1, m2) is to 1, the more similar m1 and m2 are, and
the closer it is to -1, the more dissimilar they are.

First, load in the cosine sim matrix metadata, then make a PartitionedCosineSimMatrix object.
'''
# Use a context manager so the specs file handle is closed as soon as the JSON is parsed.
with open(PATH_TO_COSINE_SIMS_PARTITION_SPECS) as partition_specs_file:
    partition_metadata = json.load(partition_specs_file)

# Numeric specs are passed through int() before use.
num_partitions = int(partition_metadata[SPEC_PARTITIONS])
partition_paths = partition_metadata[SPEC_PARTITION_PATHS]
rows_per_partition = int(partition_metadata[SPEC_ROWS_PER_PARTITION])
extra_rows_in_last_partition = int(partition_metadata[SPEC_EXTRA_ROWS_IN_LAST_PARTITION])

# Check the partition files before building the matrix object on top of them.
verify_cosine_sims_mat_exists(partition_paths)
partitioned_cosine_sim_matrix = PartitionedCosineSimMatrix(
    partition_paths, num_partitions, rows_per_partition, extra_rows_in_last_partition)

In [17]:
'''
For every movie index that a user's liked titles mapped to, look up the movies the cosine
similarity matrix reports as similar (how_similair is the threshold handed to
get_similar_movie_idxs) and print their titles.
'''
how_similair = 0.95
# Not read in this cell; kept in case other cells rely on it.
total_movies = movies_df.shape[0]
for i, user_movie_list in enumerate(movies_liked_by_users_idx_lists):
    print(f"For user {user_names[i]}:")
    for movie_idxs in user_movie_list:
        for movie_idx in movie_idxs:
            similar_to_i = partitioned_cosine_sim_matrix.get_similar_movie_idxs(
                movie_idx,
                how_similair)
            movie_i = movies_df.iloc[movie_idx][MOVIE_LENS_25M_MOVIE_TITLE_COL]
            # Drop the movie itself by index rather than removing its title from the title list:
            # list.remove() raises ValueError when the movie is not in its own similar set, and
            # filtering first lets us report "no matches" when only the movie itself qualified.
            other_similar_idxs = [idx for idx in similar_to_i if idx != movie_idx]
            if len(other_similar_idxs) > 0:
                sim_movie_titles = movies_df.iloc[other_similar_idxs][MOVIE_LENS_25M_MOVIE_TITLE_COL].to_list()
                print(
                    f"\tBased off movie {movie_i}, we recommend the following similar movies.\n\t\t",
                    "\n\t\t ".join(sim_movie_titles))
            else:
                print("\tNo matches found for", movie_i)

For user Michael:
	Based off movie Die Hard: With a Vengeance (1995), we recommend the following similar movies.
		 Hard-Boiled (Lat sau san taam) (1992)
		 Collateral (2004)
		 City on Fire (Lung fu fong wan) (1987)
		 Bourne Ultimatum, The (2007)
		 Fast & Furious 6 (Fast and the Furious 6, The) (2013)
		 one eyed king (2001)
		 The Transporter Refuelled (2015)
		 Vivegam (2017)
		 Peppermint (2018)
	Based off movie Die Hard (1988), we recommend the following similar movies.
		 Heat (1995)
		 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
		 Seven (a.k.a. Se7en) (1995)
		 Usual Suspects, The (1995)
		 Ed Wood (1994)
		 Blade Runner (1982)
		 Wild Bunch, The (1969)
		 Philadelphia Story, The (1940)
		 It Happened One Night (1934)
		 North by Northwest (1959)
		 Apartment, The (1960)
		 Casablanca (1942)
		 Roman Holiday (1953)
		 Wizard of Oz, The (1939)
		 Citizen Kane (1941)
		 Rebecca (1940)
		 Notorious (1946)
		 Adventures of Robin Hood, The (1938)
		 Mr. Smith Goes to Washington (193

In [18]:
'''
Now, let's recommend some movies based off what movies are liked by those who like what the given users like,
using the ratings matrix. The ratings matrix manager needs the number of users, which we take from the
clustering model's labels, so the clustering model (also needed further below) is loaded here too.
'''
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a model file
# that this project produced itself.
# The context manager closes the model file once it has been unpickled.
with open(PATH_TO_USER_KMEANS_MODEL, "rb") as kmeans_model_file:
    users_clustering = pickle.load(kmeans_model_file)

ratings_matrix_mgr = RatingsMatrixManager(
    PATH_TO_PROCESSED_RATINGS_DATA,
    movies_df[MOVIE_LENS_25M_MOVIE_TITLE_COL],
    len(users_clustering.labels_))  # one cluster label per user, so this is the user count

In [34]:
'''
For every movie index that a user's liked titles mapped to, ask the ratings matrix manager for
the movies liked by users who like that movie, and print them (minus the movie itself).
'''
number_of_movies_to_recommend = 16
for i, user_movie_list in enumerate(movies_liked_by_users_idx_lists):
    print(f"For user {user_names[i]}:")
    for movie_idxs in user_movie_list:
        for movie_idx in movie_idxs:
            movies_liked_by_those_who_like_what_user_likes = ratings_matrix_mgr.find_movies_liked_by_users_that_like_movie(
                movie_idx,
                number_of_movies_to_recommend)
            movie_i = movies_df.iloc[movie_idx][MOVIE_LENS_25M_MOVIE_TITLE_COL]
            # Filter the movie itself out into a new list: list.remove() raises ValueError when the
            # title is absent, and mutating the list handed back by the manager is avoided as well.
            also_liked_titles = [
                liked_title
                for liked_title in movies_liked_by_those_who_like_what_user_likes
                if liked_title != movie_i]
            if len(also_liked_titles) > 0:
                print(
                    f"\tUsers who like {movie_i} also like the following movies.\r\n\t\t",
                    "\n\t\t ".join(also_liked_titles))
            else:
                print("No one else liked", movie_i)

For user Michael:
	Users who like Die Hard: With a Vengeance (1995) also like the following movies.
		 Die Hard (1988)
		 Matrix, The (1999)
		 Jurassic Park (1993)
		 Seven (a.k.a. Se7en) (1995)
		 Silence of the Lambs, The (1991)
		 Terminator 2: Judgment Day (1991)
		 Apollo 13 (1995)
		 Usual Suspects, The (1995)
		 Forrest Gump (1994)
		 Speed (1994)
		 True Lies (1994)
		 Braveheart (1995)
		 Pulp Fiction (1994)
		 Fugitive, The (1993)
		 Shawshank Redemption, The (1994)
	Users who like Die Hard (1988) also like the following movies.
		 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
		 Matrix, The (1999)
		 Back to the Future (1985)
		 Shawshank Redemption, The (1994)
		 Silence of the Lambs, The (1991)
		 Star Wars: Episode VI - Return of the Jedi (1983)
		 Terminator, The (1984)
		 Star Wars: Episode V - The Empire Strikes Back (1980)
		 Pulp Fiction (1994)
		 Forrest Gump (1994)
		 Indiana Jones and the Last Crusade (1989)
		 Usual Suspects, The

In [20]:
'''
Now, we can embed info about a user, then use it to find similar users with the kmeans clustering.
Then, look at what those users rated highly, and recommend some of those movies.
First get the user embedding specs.
'''
# Use a context manager so the specs file handle is closed as soon as the JSON is parsed.
with open(PATH_TO_USER_EMBEDDING_SPECS) as user_embedding_specs_file:
    user_embedding_specs = json.load(user_embedding_specs_file)

# Vocabulary entries and the numeric specs are passed through int() before use.
unique_vocab = list(map(int, user_embedding_specs[SPEC_UNIQUE_VOCAB]))
output_dim = int(user_embedding_specs[SPEC_OUTPUT_DIM])
zipcode_len = int(user_embedding_specs[SPEC_ZIPCODE_LEN])
male_female_encoder = user_embedding_specs[SPEC_MALE_FEMALE]

In [21]:
# Encode each user attribute, keeping the index-to-index correspondence of the user lists.
encoded_user_genders = [
    encode_genders(gender, male_female_encoder) for gender in user_genders]
encoded_user_ages = [encode_age(age) for age in user_ages]
encoded_user_occupations = [
    encode_occupation(occupation) for occupation in user_occupations]

# One row per user; columns are keyed by position (0..3) and zipcodes go in unencoded.
encoded_user_df = pd.DataFrame({
    0: encoded_user_genders,
    1: encoded_user_ages,
    2: encoded_user_occupations,
    3: user_zipcodes,
})
encoded_user_df

Unnamed: 0,0,1,2,3
0,21,25,12,87103


In [None]:
'''
Now embed the users the way our clustering expects them to be (or, as close as we can get, at least).
embed_users hands back a vocabulary together with the embedding. Note that the returned vocabulary
overwrites unique_vocab, which is also an input here, so re-running this cell feeds the previous
result back in.
'''
unique_vocab, users_embedding = embed_users(
    encoded_user_df,
    zipcode_len,
    output_dim,
    encoded_user_df.shape[1],  # number of columns in the encoded user frame
    unique_vocab)

In [35]:
'''
Classify these users: predict a cluster label for each embedded user.
'''
user_labels = users_clustering.predict(users_embedding)
# An assignment alone displays nothing; end on the value so the labels show as the cell output.
user_labels

array([3], dtype=int32)

In [36]:
'''
Recommend movies to each given user based on what the users sharing their cluster like.
'''
number_of_movies_to_recommend = 35
for i, user_i in enumerate(user_names):
    # positions of the clustered users whose label equals the label predicted for this user
    matching_user_idxs = np.nonzero(users_clustering.labels_ == user_labels[i])[0]

    if len(matching_user_idxs) == 0:
        # this is probably an error case
        print(user_i, "is unlike any user.")
        continue

    # ask the ratings matrix which movies are most liked amongst all of the
    # matching users, and recommend those
    most_liked_by_alikes = ratings_matrix_mgr.find_movies_liked_by_alike_users(
        matching_user_idxs, number_of_movies_to_recommend)
    recommendation_header = f"Users most like {user_i} like the following movies the most.\n\t\t"
    print(recommendation_header, "\n\t\t ".join(most_liked_by_alikes))

Users most like Michael like the following movies the most.
		 101 Dalmatians (1961)
		 Twelve Monkeys (1995)
		 Seven (Se7en) (1995)
		 Postino, Il (The Postman) (1994)
		 Monty Python and the Holy Grail (1974)
		 Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
		 Wrong Trousers, The (1993)
		 Boat, The (Das Boot) (1981)
		 Close Shave, A (1995)
		 GoodFellas (1990)
		 Women on the Verge of a Nervous Breakdown (1988)
		 Grand Day Out, A (1992)
		 Raiders of the Lost Ark (1981)
		 Nikita (La Femme Nikita) (1990)
		 Year of Living Dangerously (1982)
		 Star Trek: The Wrath of Khan (1982)
		 Eat Drink Man Woman (1994)
		 Ghostbusters (1984)
		 Interview with the Vampire (1994)
		 Independence Day (ID4) (1996)
		 Men in Black (1997)
		 Kramer Vs. Kramer (1979)
		 Professional, The (a.k.a. Leon: The Professional) (1994)
		 Willy Wonka and the Chocolate Factory (1971)
		 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
		 Raise the Red Lantern 