# Project 4: CS 598 Practical Statistical Learning, Fall 2023

## Movie Recommender System

#### Team Members and Responsibilities
* **Neal Ryan** *(nealpr2):* System I + padding logic for System II
* **Kurt Tuohy** *(ktuohy):* System II similarity matrix and core recommendation system
* **Alelign Faris** *(faris2):* Python Dash app

## Install Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import sparse  # To save System II similarity matrix in sparse format

### Load Data

In [2]:
#project_root_dir = "Data"
movies_data_filename = "Data/movies.dat"
ratings_data_filename = "Data/ratings.dat"
users_data_filename = "Data/users.dat"
movielens_data_filename = "Data/MovieLens_Dataset.csv"

In [3]:
#Movies
movies = pd.read_csv(movies_data_filename, sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']
detailed_movies = movies.copy()
multiple_idx = pd.Series([("|" in movie) for movie in movies['Genres']])
movies.loc[multiple_idx, 'Genres'] = 'Multiple'

#Ratings
ratings = pd.read_csv(ratings_data_filename, sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = ratings.drop('Timestamp', axis = 1)

#Users
users = pd.read_csv(users_data_filename, sep='::', engine = 'python', header=None)
users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

# MovieLens user/movie rating matrix
user_movie_df = pd.read_csv("Data/MovieLens_Dataset.csv", delimiter=",")
# Set username as index
user_movie_df.set_index("user", inplace=True)

# Save movie IDs from matrix for later use
movie_id_columns = user_movie_df.columns

# Create crosswalk of movid IDs between System I and System II datasets.
# System I uses integers for movie IDs, while System II prepends the ID with "m".
movie_id_crosswalk = {movie_id: int(movie_id[1:]) for movie_id in movie_id_columns}

In [4]:
# Show list of movies
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Multiple
1,2,Jumanji (1995),Multiple
2,3,Grumpier Old Men (1995),Multiple
3,4,Waiting to Exhale (1995),Multiple
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [5]:
# Show user/movie rating matrix
user_movie_df

Unnamed: 0_level_0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
u997,4.0,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


## System I: Recommendation Based on Genres

### Process Data

#### Normalize Ratings

In [7]:
user_avg_ratings = ratings.groupby('UserID')['Rating'].mean().reset_index()
user_avg_ratings.columns = ['UserID', 'Avg_User_Rating']
user_avg_ratings

Unnamed: 0,UserID,Avg_User_Rating
0,1,4.188679
1,2,3.713178
2,3,3.901961
3,4,4.190476
4,5,3.146465
...,...,...
6035,6036,3.302928
6036,6037,3.717822
6037,6038,3.800000
6038,6039,3.878049


In [8]:
ratings = pd.merge(ratings, user_avg_ratings, on='UserID')
ratings['Normalized_Rating'] = ratings['Rating'] / ratings['Avg_User_Rating']
ratings

Unnamed: 0,UserID,MovieID,Rating,Avg_User_Rating,Normalized_Rating
0,1,1193,5,4.188679,1.193694
1,1,661,3,4.188679,0.716216
2,1,914,3,4.188679,0.716216
3,1,3408,4,4.188679,0.954955
4,1,2355,5,4.188679,1.193694
...,...,...,...,...,...
1000204,6040,1091,1,3.577713,0.279508
1000205,6040,1094,5,3.577713,1.397541
1000206,6040,562,5,3.577713,1.397541
1000207,6040,1096,4,3.577713,1.118033


#### Full Genre Capture

In [9]:
genre_dummies = detailed_movies['Genres'].str.get_dummies(sep='|')
detailed_movies = pd.concat([detailed_movies, genre_dummies], axis=1)
detailed_movies

Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Padding Function

To be used when the length of the current returned dataframe is not equal to 10 (or some specified value).
Mostly useful as a helper function for collaborative filtering

In [10]:
def pad_recommendations(current_recommendations_df, num=10, genre='any', metric='Score'):
    #Pad any recommendation dataframe up to the specified number with the best 
    #movies from the genre (if specified, best in general if not)
    #Useful for sparse recommendations

    num_to_pad = num - len(current_recommendations_df)
    if num_to_pad==0:
        return current_recommendations_df
    
    if genre== 'any':
        pad_movies = current_recommendations_df
    else:
        genre_filter = current_recommendations_df[genre] == 1
        pad_movies = current_recommendations_df[genre_filter]
    sorted_genre_movies = pad_movies.sort_values(by=metric, ascending=False)
    sorted_genre_movies = sorted_genre_movies[~sorted_genre_movies['MovieID'].isin(current_recommendations_df['MovieID'])]
    top_genre_movies = sorted_genre_movies.head(num_to_pad)[['MovieID', 'Title', metric]]
    
    return top_genre_movies

### Determine Rating Score

In [11]:
movie_avg_ratings = ratings.groupby('MovieID').agg({'Rating': ['mean', 'count'], 'Normalized_Rating': 'mean'}).reset_index()
movie_avg_ratings.columns = ['MovieID', 'Avg_Rating', 'Num_Ratings', 'Avg_Normalized_Rating']
movies_with_ratings = pd.merge(detailed_movies, movie_avg_ratings, on='MovieID', how='left')
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.sort_values(by='Avg_Normalized_Rating', ascending=False)


Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg_Rating,Num_Ratings,Avg_Normalized_Rating
3313,3382,Song of Freedom (1936),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.0,1.0,2.594340
3254,3323,Chain of Fools (2000),Comedy|Crime,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,3.0,1.0,1.556604
553,557,Mamma Roma (1962),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.5,2.0,1.517864
574,578,"Hour of the Pig, The (1993)",Drama|Mystery,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,4.5,2.0,1.410178
748,758,"Jar, The (Khomreh) (1992)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.0,1.0,1.392796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,684,Windows (1980),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,1.0,0.300000
3140,3209,"Loves of Carmen, The (1948)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,1.0,0.292453
1291,1311,Santa with Muscles (1996),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1.0,7.0,0.281847
1406,1430,Underworld (1997),Thriller,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1.0,1.0,0.268789


Obviously this favors movies that have a low number of ratings, so we now look to make a scoring function that combines the value of more rating data and the average normalized rating

We place more weight on the normalized rating, but implement a weight on the number of ratings mainly to ensure that values of 1 rating don't commonly end up at the top

In [12]:
weight_normalized_rating = 0.9 
weight_num_ratings = 1-weight_normalized_rating
max_num_ratings = 1000

movies_with_ratings['Capped_Num_Ratings'] = movies_with_ratings['Num_Ratings'].clip(upper=max_num_ratings)

movies_with_ratings['Score'] = (
    weight_normalized_rating * movies_with_ratings['Avg_Normalized_Rating'] +
    weight_num_ratings * movies_with_ratings['Capped_Num_Ratings']
)

movies_with_ratings.drop('Capped_Num_Ratings', axis=1, inplace=True)

movies_with_ratings.sort_values(by='Score', ascending=False)

Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg_Rating,Num_Ratings,Avg_Normalized_Rating,Score
315,318,"Shawshank Redemption, The (1994)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4.554558,2227.0,1.238783,101.114905
49,50,"Usual Suspects, The (1995)",Crime|Thriller,0,0,0,0,0,1,0,...,0,0,0,1,0,0,4.517106,1783.0,1.235867,101.112281
523,527,Schindler's List (1993),Drama|War,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4.510417,2304.0,1.223836,101.101452
847,858,"Godfather, The (1972)",Action|Crime|Drama,1,0,0,0,0,1,0,...,0,0,0,0,0,0,4.524966,2223.0,1.223737,101.101363
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure,1,1,0,0,0,0,0,...,0,0,0,0,0,0,4.477725,2514.0,1.217865,101.096079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3391,3460,Hillbillys in a Haunted House (1967),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.300189,0.370170
677,684,Windows (1980),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.300000,0.370000
3140,3209,"Loves of Carmen, The (1948)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.292453,0.363208
1406,1430,Underworld (1997),Thriller,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1.000000,1.0,0.268789,0.341910


### Create Function to Retrieve the top 10 scores from a particular genre

In [13]:
genres = movies['Genres'].unique()

def top_movies_by_genre(genre, df=movies_with_ratings, metric='Score', num=10):
    #Recommend the top num scoring movies from the genre specified
    
    genre_filter = df[genre] == 1
    genre_movies = df[genre_filter]
    sorted_genre_movies = genre_movies.sort_values(by=metric, ascending=False)
    top_genre_movies = sorted_genre_movies.head(num)[['MovieID', 'Title', metric]]
    top_genre_movies = pad_recommendations(top_genre_movies)

    return top_genre_movies

In [14]:
genre = 'Comedy'
top_action_movies = top_movies_by_genre(genre)
top_action_movies


Unnamed: 0,MovieID,Title,Score
1120,1136,Monty Python and the Holy Grail (1974),101.063561
2789,2858,American Beauty (1999),101.063404
2255,2324,Life Is Beautiful (La Vita è bella) (1997),101.060495
1179,1197,"Princess Bride, The (1987)",101.059733
1215,1234,"Sting, The (1973)",101.057779
1258,1278,Young Frankenstein (1974),101.047442
2735,2804,"Christmas Story, A (1983)",101.043719
3045,3114,Toy Story 2 (1999),101.042635
1268,1288,This Is Spinal Tap (1984),101.029096
1284,1304,Butch Cassidy and the Sundance Kid (1969),101.025586


## System II: Recommendation Based on IBCF

### Generate full movie/movie similarity matrix

Based on the MovieLens user/movie rating matrix loaded above.

Compute cosine similarity for each pair of movies that shares 3 or more raters.

Skip computing similarity of each movie to itself. Leave those matrix entries as NaN.

In [72]:
# Add column to movies_with_ratings that's the Movie ID prepended with "m".
# This lets us link to the movie IDs used in System II.
movies_with_ratings["char_movie_id"] = "m" + movies_with_ratings["MovieID"].astype(str)

In [46]:
# Convert the user/movie ratings dataframe to a matrix
user_movie_matrix = user_movie_df.to_numpy()

In [47]:
# Center each row to handle tough and easy reviewers. Ignore missing values.
user_movie_matrix = user_movie_matrix - np.nanmean(user_movie_matrix, axis=1)[:, np.newaxis]

In [48]:
# Identify non-NaN ratings.
rating_not_na = np.logical_not(np.isnan(user_movie_matrix))

In [49]:
# Save the number of movies in the dataset
n_movies = user_movie_matrix.shape[1]

In [50]:
# Initialize all entries in similarity matrix to NaN.
movie_S = np.full((n_movies, n_movies), fill_value=np.NAN, dtype="float64")

# The minimum shared ratings for a pair of movies to be included in the similarity matrix
min_shared_ratings = 3

In [51]:
# Populate similarity matrix using cosine measure.

# For each pair of movies, compute the similarity-matrix value based only on
# the users who rated both movies. There must be at least 3 shared users per pair of movies.

# Loop over pairs of movies.
for i in range(n_movies-1):
    for j in range(i+1, n_movies):

        # Indexes of users who rated both movies of pair (i, j)
        rating_pair_indexes = np.logical_and(rating_not_na[:, i], rating_not_na[:, j])

        # Count the users who rated both movies.
        num_rating_pairs = np.sum(rating_pair_indexes)

        # If movies i and j have at least 3 pairs of ratings, compute their similarity.
        if (num_rating_pairs >= min_shared_ratings):
            # Get ratings left by the users who rated both
            movie_i_shared_ratings = user_movie_matrix[rating_pair_indexes, i]
            movie_j_shared_ratings = user_movie_matrix[rating_pair_indexes, j]
            
            # Compute cosine similarity components
            cosine_similarity_numerator = np.dot(movie_i_shared_ratings, movie_j_shared_ratings)
            cosine_similarity_denominator = np.sqrt(np.dot(movie_i_shared_ratings, movie_i_shared_ratings)) * \
                    np.sqrt(np.dot(movie_j_shared_ratings, movie_j_shared_ratings))
            
            # Store the similarity in S, the similarity matrix.
            movie_S[i, j] = 0.5 + (0.5 * cosine_similarity_numerator / cosine_similarity_denominator)
            # Make sure the similarity matrix is symmetric
            movie_S[j, i] = movie_S[i, j]

# Show the full movie/movie similarity matrix
movie_S

array([[       nan, 0.51210553, 0.39199995, ..., 0.5140432 , 0.38377183,
        0.41450545],
       [0.51210553,        nan, 0.54745829, ..., 0.66873273, 0.44828951,
        0.60081163],
       [0.39199995, 0.54745829,        nan, ..., 0.26957569, 0.47892265,
        0.6128149 ],
       ...,
       [0.5140432 , 0.66873273, 0.26957569, ...,        nan, 0.64263547,
        0.4606457 ],
       [0.38377183, 0.44828951, 0.47892265, ..., 0.64263547,        nan,
        0.64272702],
       [0.41450545, 0.60081163, 0.6128149 , ..., 0.4606457 , 0.64272702,
               nan]])

In [52]:
# Save similarity matrix as dataframe indexed by movie IDs
movie_S_df = pd.DataFrame(movie_S, index=movie_id_columns, columns=movie_id_columns)

### Cull the similarity matrix

For each row, retain only the 30 highest similarity values. Set the rest to NaN.

In [53]:
# First identify how many non-NaN similarity values each movie has.
non_nan_counts = np.sum(np.logical_not(np.isnan(movie_S)), axis=1)

In [54]:
# Initialize matrix of NaN values, same shape as the similarity matrix.
# Will fill these with top 30 similarity scores for each movie.
highest_S = np.full((n_movies, n_movies), fill_value=np.NAN, dtype="float64")

# The maximum similarity scores to retain for each movie.
max_sim_scores = 30

In [55]:
# Loop over rows of similarity matrix and fill in the empty culled matrix.

for i in range(movie_S.shape[0]):
    # Only clip the row if it has more than the max allowed non-NaN similarity scores
    if (non_nan_counts[i] > max_sim_scores):
        # Get row of similarity scores
        movie_S_i = movie_S[i, :]
        # Get indexes of the highest 30 similarity scores in the movie_S row
        movie_S_i_indexes_of_highest = np.sort(np.argsort(np.nan_to_num(movie_S_i, nan=0.0))[-max_sim_scores:])
        # Copy the 30 highest similarity values to highest_S
        highest_S[i, movie_S_i_indexes_of_highest] = movie_S[i, movie_S_i_indexes_of_highest]
    # If movie has 30 or less similarity scores, just copy them to highest_S
    else:
        highest_S[i, :] = movie_S[i, :]

# Show the culled similarity matrix
highest_S

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

#### Check results. How many non-NaN similarities are there for each row?

In [56]:
row_sim_counts = np.sum(np.logical_not(np.isnan(highest_S)), axis=1)
sim_counts_df = pd.DataFrame({"sim_count": row_sim_counts})
sim_counts_df.groupby(["sim_count"])["sim_count"].count()

sim_count
0      207
1        3
2        1
3        2
4        1
5        1
6        4
8        1
11       1
12       2
14       1
16       1
17       1
20       1
21       1
28       1
29       3
30    3474
Name: sim_count, dtype: int64

### Save the culled similarity matrix to file

In [57]:
# Convert the culled similarity matrix to sparse format
highest_S_sparse = sparse.csr_matrix(highest_S)

In [58]:
# Save to file
sparse.save_npz("Data/movie_similarity_matrix.npz", highest_S_sparse)

#### Sample code to read in the sparse similarity matrix and re-inflate it

In [59]:
# Read in the sparse matrix
highest_S_sparse_read_back = sparse.load_npz("Data/movie_similarity_matrix.npz")

# Convert sparse matrix to dense
highest_S_dense_read_back = highest_S_sparse_read_back.toarray()

# Convert dense matrix to dataframe, indexed by movie names
highest_S_read_back_df = pd.DataFrame(highest_S_dense_read_back, index=movie_id_columns, columns=movie_id_columns)

### Display pairwise similarity values for selected movie IDs

Use the full similarity matrix, since the culled matrix only returns NaN similarity values.

In [60]:
selected_movie_ids = ["m1", "m10", "m100", "m1510", "m260", "m3212"]

In [61]:
# Display seven decimal places
movie_S_df.loc[selected_movie_ids, selected_movie_ids].style.format("{:.7f}")

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.5121055,0.3919999,,0.7411482,
m10,0.5121055,,0.5474583,,0.5343338,
m100,0.3919999,0.5474583,,,0.3296943,
m1510,,,,,,
m260,0.7411482,0.5343338,0.3296943,,,
m3212,,,,,,


### Recommendation logic

#### Helper function to list top n movies from System I

Will use this to pad out recommendations when System II logic doesn't fetch enough of them.

In [73]:
def top_alternate_movies(exclude_movie_ids, global_movie_df=movies_with_ratings, metric='Score', num=10):
    """
    Recommend the top num scoring movies that are not in the given list.
    Use this to pad out similarity-based recommendations when there are too few.
    
    Arguments:
    * exclude_movie_ids: integer array of movie IDs that are already in the recommendation list
    * global_movie_df: dataframe of 
    """
    
    # Sort global movie list by score, highest first
    sorted_movies = global_movie_df.sort_values(by=metric, ascending=False)
    # Exclude movies that are in the exclusion list
    sorted_movies = sorted_movies[~sorted_movies['char_movie_id'].isin(exclude_movie_ids)]
    top_num_movies = sorted_movies.head(num)[['char_movie_id', 'Title']]
    # Rename columns for compatibility with myIBCF() below.
    top_num_movies.columns = ['movie_id', 'title']
    # No predicted rating for globally popular movies. Create column for compatibility with myIBCF().
    top_num_movies["predicted_rating"] = np.NAN   

    return top_num_movies

#### Function myIBCF to recommend top 10 movies to new user

In [106]:
def myIBCF(newuser, n_req_recommendations=10, min_rating_recommended=3.0, min_similarity_for_addition=0.9):
    """
    Given a viewer's movie ratings and a matrix of movie similarities,
    recommend different movies to the user.
    Recommend exactly n_req_recommendations movies.
    
    Arguments:
    * newuser: 1-D array of user's movie ratings. Unrated films have NaN entries.
    * n_req_recommendations: the required number of recommendations to return.
    * min_rating_recommended: only recommend movies with this predicted rating and above.
    * min_similarity_for_addition: If the function generates less than n_req_recommendations,
        find the most similar movies to the recommended ones and pad out the list.
        "min_similarity_for_addition" is the minimum cutoff to consider an additional movie
        similar enough to the ones already recommended.
    """
    
    # Initialize return values
    pred_ratings_sorted = np.full((n_req_recommendations), np.NAN)
    # Recommended movies for user
    recommended_movie_ids = np.full((n_req_recommendations), np.NAN)
    
    
    # Get index values of movies the new user has rated.
    rated_index = np.squeeze(np.argwhere(np.logical_not(np.isnan(newuser)))).tolist()
    
    # Identify movies the new user hasn't rated yet.
    # Will use these to skip rating movies a user has already rated.
    unrated_index = np.squeeze(np.argwhere(np.isnan(newuser))).tolist()
    
    # IDs of unrated movies in the reduced matrix
    unrated_movie_ids = movie_id_columns[unrated_index]
    
    # Initialize the number of recommendations to return
    n_recommended = 0

    # If user rated at least one movie, find similar movies to recommend
    if (len(unrated_index) < highest_S.shape[0]):
    
        # To make predictions, create reduced version of the culled similarity matrix.
        # The user's rated movies will be columns and unrated movies will be rows.
        # Fill NaN values with zero to make calculations easier.

        S_for_pred = np.nan_to_num(highest_S[unrated_index, :][:, rated_index], nan=0.0)
        
        # Numerators of predicted ratings
        pred_numerators = np.dot(S_for_pred, newuser[rated_index])

        # Initialize predicted rating matrix.
        # Force NaNs to zero to help sort ratings.
        pred_ratings = np.full((pred_numerators.shape), fill_value=0.0, dtype="float64")

        # Compute predicted ratings.
        
        # Similarity matrix for predictions is two-dimensional: normal case.
        if (S_for_pred.ndim > 1):
            # Denominators of predicted ratings
            pred_denominators = np.sum(S_for_pred, axis=1)
            # Indexes of nonzero denominators: movies with at least one similarity value to something the new user rated
            nonzero_denom_index = np.argwhere(pred_denominators > 0.0)
            # Generate predictions
            pred_ratings[nonzero_denom_index] = pred_numerators[nonzero_denom_index] / pred_denominators[nonzero_denom_index]
        # If only one similarity value, treat it as a scalar.
        else:
            pred_denominators = np.sum(S_for_pred)
            if (pred_denominators > 0):
                # Generate predictions
                pred_ratings = pred_numerators / pred_denominators

        # Count the predictions that are equal to or greater than min_rating_recommended.
        n_recommended = np.sum(pred_ratings >= min_rating_recommended)
        # Set a ceiling on the number of recommendations to return.
        n_recommended = min(n_recommended, n_req_recommendations)
        
    # If any movies recommended, sort them by descending rating
    if (n_recommended > 0):

        # Sort the predicted ratings, highest first. Return max n_req_recommendations ratings.
        pred_rating_sort_index = np.argsort(pred_ratings)[::-1][:n_recommended]
        # Sort movie IDs in the same order
        recommended_movie_ids = unrated_movie_ids[pred_rating_sort_index]
        pred_ratings_sorted = pred_ratings[pred_rating_sort_index]
        
        # Create dataframe of recommendations to return.
        # Include movie ID, title, and predicted rating.

        recommended_movies = pd.DataFrame({
            "movie_id": recommended_movie_ids,
            "title": [str(movies[movies.MovieID == movie_id_crosswalk[movie_id]]["Title"]) for movie_id in recommended_movie_ids],
            "predicted_rating": pred_ratings_sorted
        })
        
        # If not enough recommendations, fill in the rest based on System I logic.
        if (n_recommended < n_req_recommendations):
            # Don't include existing recommendations in the list of additional movies.
            additional_movies = top_alternate_movies(exclude_movie_ids=recommended_movie_ids, num=n_req_recommendations - n_recommended)
            recommended_movies = pd.concat([recommended_movies, additional_movies])
            
    # Either user hasn't rated any movies, or no recommendations found. Use System I logic to make recommendations.
    else:
        recommended_movies = top_alternate_movies(exclude_movie_ids=[], num=n_req_recommendations)

         
    # TO DO IF TIME: pad out recommendations with movies that are similar to what's in the recommendation list.
    #
    # Steps
    #
    # 1) Find movies that are both unrated and un-recommended.
    #    a) Slice highest_S by rows = intersection(unrated, unrecommended) and columns = recommended.
    # 2) Null out similarity values < min_similarity_for_addition.
    # 3) For any remaining rows with similarity values:
    #    a) Find the row with the highest similarity value.
    #    b) Add row movie_ID to the recommended list.
    #    c) Remove row from the matrix and add it as a column.
    # 4) If we don't have enough recommendations, fill out the rest with pad_recommendations().
            
    # Return recommended movies
    return recommended_movies

### Test myIBCF. Display top 10 recommendations for three users

#### User u1181

In [75]:
u1181_ratings = user_movie_df.loc["u1181", :].to_numpy()

In [100]:
u1181_recommended_movies = myIBCF(u1181_ratings)
u1181_recommended_movies

Unnamed: 0,movie_id,title,predicted_rating
0,m3732,"3663 Fury, The (1978)\nName: Title, dtype: ...",5.0
1,m749,"739 Man from Down Under, The (1943)\nName: ...",4.526559
2,m3899,"3829 Circus (2000)\nName: Title, dtype: object",4.526066
3,m1734,1685 My Life in Pink (Ma vie en rose) (1997...,4.0
4,m249,"246 Immortal Beloved (1994)\nName: Title, d...",4.0
5,m2361,"2292 Pink Flamingos (1972)\nName: Title, dt...",4.0
6,m1253,"1233 Day the Earth Stood Still, The (1951)\...",4.0
7,m3752,"3683 Me, Myself and Irene (2000)\nName: Tit...",4.0
8,m2082,"2013 Mighty Ducks, The (1992)\nName: Title,...",4.0
9,m3789,"3720 Pawnbroker, The (1965)\nName: Title, d...",4.0


#### User u1351

In [77]:
u1351_ratings = user_movie_df.loc["u1351", :].to_numpy()

In [101]:
u1351_recommended_movies = myIBCF(u1351_ratings)
u1351_recommended_movies

Unnamed: 0,movie_id,title,predicted_rating
0,m1514,1479 Temptress Moon (Feng Yue) (1996)\nName...,5.0
1,m1780,1722 Ayn Rand: A Sense of Life (1997)\nName...,5.0
2,m1901,"1832 Dear Jesse (1997)\nName: Title, dtype:...",5.0
3,m2061,"1992 Full Tilt Boogie (1997)\nName: Title, ...",5.0
4,m853,"842 Dingo (1992)\nName: Title, dtype: object",5.0
5,m1871,"1802 Friend of the Deceased, A (1997)\nName...",5.0
6,m2063,1994 Seventh Heaven (Le Septième ciel) (199...,5.0
7,m1532,"1495 Sprung (1997)\nName: Title, dtype: object",5.0
8,m560,"556 Beans of Egypt, Maine, The (1994)\nName...",5.0
9,m2869,"2800 Separation, The (La Séparation) (1994)...",5.0


#### Hypothetical user who rates movie “m1613” with 5 and movie “m1755” with 4

In [79]:
m1613_index = np.where(movie_id_columns == "m1613")[0][0]
m1755_index = np.where(movie_id_columns == "m1755")[0][0]

hyp_user_ratings = np.full((movie_id_columns.shape[0]), np.NAN)
hyp_user_ratings[m1613_index] = 5
hyp_user_ratings[m1755_index] = 4

In [102]:
hyp_user_recommended_movies = myIBCF(hyp_user_ratings)
hyp_user_recommended_movies

Unnamed: 0,movie_id,title,predicted_rating
0,m3269,"3200 Forever Young (1992)\nName: Title, dty...",5.0
1,m592,"588 Batman (1989)\nName: Title, dtype: object",5.0
2,m765,"755 Jack (1996)\nName: Title, dtype: object",5.0
3,m1017,1004 Swiss Family Robinson (1960)\nName: Ti...,5.0
4,m74,"73 Bed of Roses (1996)\nName: Title, dtype:...",5.0
5,m2808,"2739 Universal Soldier (1992)\nName: Title,...",5.0
6,m691,"682 Mrs. Winterbourne (1996)\nName: Title, ...",5.0
7,m1688,"1642 Anastasia (1997)\nName: Title, dtype: ...",5.0
8,m2718,2649 Drop Dead Gorgeous (1999)\nName: Title...,5.0
9,m2945,"2876 Mike's Murder (1984)\nName: Title, dty...",5.0


#### User who has rated no movies

In [81]:
no_user_ratings = np.full((movie_id_columns.shape[0]), np.NAN)

In [82]:
default_user_recommended_movies = myIBCF(no_user_ratings)
default_user_recommended_movies

Unnamed: 0,movie_id,title,predicted_rating
315,m318,"Shawshank Redemption, The (1994)",
49,m50,"Usual Suspects, The (1995)",
523,m527,Schindler's List (1993),
847,m858,"Godfather, The (1972)",
1180,m1198,Raiders of the Lost Ark (1981),
257,m260,Star Wars: Episode IV - A New Hope (1977),
740,m750,Dr. Strangelove or: How I Learned to Stop Worr...,
2693,m2762,"Sixth Sense, The (1999)",
892,m904,Rear Window (1954),
901,m913,"Maltese Falcon, The (1941)",


#### User who receives insufficient recommendations

In [104]:
insuffic_user_ratings = np.full((movie_id_columns.shape[0]), np.NAN)

m3407_index = np.where(movie_id_columns == "m3407")[0][0]
insuffic_user_ratings[m3407_index] = 5

In [107]:
padded_user_recommended_movies = myIBCF(insuffic_user_ratings)
padded_user_recommended_movies

Unnamed: 0,movie_id,title,predicted_rating
0,m1210,1192 Star Wars: Episode VI - Return of the ...,5.0
315,m318,"Shawshank Redemption, The (1994)",
49,m50,"Usual Suspects, The (1995)",
523,m527,Schindler's List (1993),
847,m858,"Godfather, The (1972)",
1180,m1198,Raiders of the Lost Ark (1981),
257,m260,Star Wars: Episode IV - A New Hope (1977),
740,m750,Dr. Strangelove or: How I Learned to Stop Worr...,
2693,m2762,"Sixth Sense, The (1999)",
892,m904,Rear Window (1954),
