# CS 598 Practical Statistical Learning Project 4, Fall 2023

## Generate similarity matrix for System II

### Load libraries and read in the user/movie rating matrix.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the MovieLens user/movie ratings table from its comma-separated file.
user_movie_df = pd.read_csv("Data/MovieLens_Dataset.csv", sep=",")

In [3]:
# Key the rows by the "user" column, so each row label is a user name.
user_movie_df = user_movie_df.set_index("user")

In [4]:
# Preview the ratings table: one row per user, one column per movie.
user_movie_df

Unnamed: 0_level_0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
u997,4.0,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


### Save movie IDs for later use

In [5]:
# The column labels of the ratings table are the movie IDs. Keep them so the
# similarity matrix rows/columns can be labelled with them later.
movie_IDs = user_movie_df.columns
movie_IDs

Index(['m1', 'm10', 'm100', 'm1000', 'm1002', 'm1003', 'm1004', 'm1005',
       'm1006', 'm1007',
       ...
       'm99', 'm990', 'm991', 'm992', 'm993', 'm994', 'm996', 'm997', 'm998',
       'm999'],
      dtype='object', length=3706)

In [6]:
# Persist the movie IDs as a one-column CSV (column "movie_ID", no row index)
# so the row/column order of the similarity matrix can be recovered later.
movie_id_table = pd.DataFrame({"movie_ID": movie_IDs})
movie_id_table.to_csv("Data/movie_IDs.csv", index=False)

### Generate full movie/movie similarity matrix

In [7]:
# Convert the user/movie ratings dataframe to a plain NumPy array
# (rows = users, columns = movies, NaN = no rating).
user_movie_matrix = user_movie_df.to_numpy()

In [8]:
# Inspect the raw (uncentered) ratings array.
user_movie_matrix

array([[ 5., nan, nan, ..., nan, nan, nan],
       [ 5., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [9]:
# Remove each user's own average rating so that harsh and generous reviewers
# become comparable. nanmean skips the missing ratings; keepdims=True keeps the
# means as a column vector so they broadcast across each user's row.
user_mean_ratings = np.nanmean(user_movie_matrix, axis=1, keepdims=True)
user_movie_matrix = user_movie_matrix - user_mean_ratings

In [10]:
# Inspect the row-centered ratings array.
user_movie_matrix

array([[0.81132075,        nan,        nan, ...,        nan,        nan,
               nan],
       [0.88528678,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       ...,
       [0.06666667,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan],
       [       nan,        nan,        nan, ...,        nan,        nan,
               nan]])

In [11]:
# Boolean mask with the same shape as the ratings array:
# True where a rating is present, False where it is missing (NaN).
rating_not_na = ~np.isnan(user_movie_matrix)

In [12]:
# Inspect the "rating present" mask.
rating_not_na

array([[ True, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [13]:
# Save the number of movies in the dataset (the number of columns of the
# ratings array); it is the side length of the square similarity matrix.
n_movies = user_movie_matrix.shape[1]

In [14]:
# Set up the movie/movie similarity matrix (cosine measure) and the threshold
# that decides which movie pairs get a similarity value.

# Initialize all entries in the similarity matrix to NaN, meaning "no
# similarity computed for this pair". np.nan is used rather than the np.NAN
# alias, which was removed in NumPy 2.0.
movie_S = np.full((n_movies, n_movies), fill_value=np.nan, dtype="float64")

# The minimum number of users who must have rated both movies of a pair for
# that pair to be included in the similarity matrix.
min_shared_ratings = 3

In [15]:
# For each pair of movies, compute the similarity-matrix value based only on
# the users who rated both movies. A pair needs at least min_shared_ratings
# shared users; otherwise its entry keeps the initial NaN.

# Loop over the upper triangle of movie pairs (i < j) and mirror each result.
for i in range(n_movies - 1):
    # Column i is reused for every j, so slice it once per outer iteration.
    movie_i_rated = rating_not_na[:, i]
    movie_i_ratings = user_movie_matrix[:, i]

    for j in range(i + 1, n_movies):

        # Boolean mask of users who rated both movies of pair (i, j)
        rating_pair_indexes = np.logical_and(movie_i_rated, rating_not_na[:, j])

        # Count the users who rated both movies.
        num_rating_pairs = np.sum(rating_pair_indexes)

        # Too few shared ratings: leave the entry as NaN.
        if num_rating_pairs < min_shared_ratings:
            continue

        # Centered ratings left by the users who rated both movies
        movie_i_shared_ratings = movie_i_ratings[rating_pair_indexes]
        movie_j_shared_ratings = user_movie_matrix[rating_pair_indexes, j]

        # Cosine similarity components: dot product over the product of norms
        cosine_similarity_numerator = np.dot(movie_i_shared_ratings, movie_j_shared_ratings)
        cosine_similarity_denominator = (
            np.sqrt(np.dot(movie_i_shared_ratings, movie_i_shared_ratings))
            * np.sqrt(np.dot(movie_j_shared_ratings, movie_j_shared_ratings))
        )

        # If either vector of shared centered ratings is all zeros, the cosine
        # is undefined. Leave the entry as NaN explicitly instead of relying on
        # a 0/0 division (which yields NaN only via a RuntimeWarning).
        if cosine_similarity_denominator == 0:
            continue

        # Map the cosine from [-1, 1] to [0, 1] and store it in S.
        movie_S[i, j] = 0.5 + (0.5 * cosine_similarity_numerator / cosine_similarity_denominator)
        # Make sure the similarity matrix is symmetric
        movie_S[j, i] = movie_S[i, j]

# Show the full movie/movie similarity matrix
movie_S

array([[       nan, 0.51210553, 0.39199995, ..., 0.5140432 , 0.38377183,
        0.41450545],
       [0.51210553,        nan, 0.54745829, ..., 0.66873273, 0.44828951,
        0.60081163],
       [0.39199995, 0.54745829,        nan, ..., 0.26957569, 0.47892265,
        0.6128149 ],
       ...,
       [0.5140432 , 0.66873273, 0.26957569, ...,        nan, 0.64263547,
        0.4606457 ],
       [0.38377183, 0.44828951, 0.47892265, ..., 0.64263547,        nan,
        0.64272702],
       [0.41450545, 0.60081163, 0.6128149 , ..., 0.4606457 , 0.64272702,
               nan]])

### For each row, find the 30 highest similarity values. Set the rest to NaN.

In [16]:
# Per movie (row of the similarity matrix), count how many similarity values
# were actually computed, i.e. are not NaN.
non_nan_counts = np.count_nonzero(~np.isnan(movie_S), axis=1)

In [17]:
# Initialize a matrix of NaN values with the same shape as the similarity
# matrix. It will be filled with the top similarity scores of each movie.
# np.nan is used rather than the np.NAN alias, which was removed in NumPy 2.0.
highest_S = np.full((n_movies, n_movies), fill_value=np.nan, dtype="float64")

In [18]:
# The maximum number of similarity scores to retain for each movie (row);
# rows with more non-NaN scores than this are cut down to their highest ones.
max_sim_scores = 30

In [19]:
# Keep, for each movie (row of the similarity matrix), only its
# max_sim_scores highest similarity values; everything else stays NaN.

for i in range(movie_S.shape[0]):
    # Row of similarity scores for movie i
    movie_S_i = movie_S[i, :]

    # A row with max_sim_scores or fewer similarity scores is copied unchanged.
    if non_nan_counts[i] <= max_sim_scores:
        highest_S[i, :] = movie_S_i
        continue

    # Rank missing entries below every real similarity. Similarities lie in
    # [0, 1], so a 0.0 placeholder would tie with a genuine similarity of 0.0
    # and a NaN column could be selected in its place; -inf can never tie.
    ranking_values = np.where(np.isnan(movie_S_i), -np.inf, movie_S_i)

    # argsort is ascending, so the last max_sim_scores positions are the
    # columns holding the highest similarity scores of this row.
    movie_S_i_indexes_of_highest = np.argsort(ranking_values)[-max_sim_scores:]

    # Copy only those highest similarity values to highest_S
    highest_S[i, movie_S_i_indexes_of_highest] = movie_S_i[movie_S_i_indexes_of_highest]

# Show the culled similarity matrix
highest_S

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

#### Check results. How many non-NaN similarities are there for each row?

In [20]:
# Number of retained (non-NaN) similarity values in each row of highest_S.
row_sim_counts = np.count_nonzero(~np.isnan(highest_S), axis=1)

In [21]:
# Wrap the per-row counts in a one-column dataframe so they can be grouped.
sim_counts_df = pd.Series(row_sim_counts, name="sim_count").to_frame()

In [22]:
# Tabulate how many movies (rows) ended up with each number of similarities.
rows_by_sim_count = sim_counts_df.groupby("sim_count")
rows_by_sim_count["sim_count"].count()

sim_count
0      207
1        3
2        1
3        2
4        1
5        1
6        4
8        1
11       1
12       2
14       1
16       1
17       1
20       1
21       1
28       1
29       3
30    3474
Name: sim_count, dtype: int64

### Save the culled similarity matrix to file

#### Save as dense matrix in CSV format

In [23]:
# Wrap the culled similarity matrix in a dataframe whose rows and columns are
# both labelled with the movie IDs, then display it.
highest_S_df = pd.DataFrame(highest_S, index=movie_IDs, columns=movie_IDs)
highest_S_df

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,,,,,
m996,,,,,,,,,,,...,,,,,,,,,,
m997,,,,,,,,,,,...,,,,,,,,,,
m998,,,,,,,,,,,...,,,,,,,,,,


In [24]:
# Write the culled similarity matrix as a dense CSV: movie IDs as both the
# header row and the index column, with missing similarities as empty fields.
highest_S_df.to_csv(
    "Data/movie_similarity_matrix.csv",
    na_rep="",
    header=True,
    index=True,
)

#### Alternatively, save as sparse matrix in NPZ format

In [25]:
from scipy import sparse

In [26]:
# Convert the culled similarity matrix to sparse (CSR) format.
# NOTE(review): highest_S marks missing similarities with NaN, not 0. The
# read-back further down returns NaN after .toarray(), which indicates the NaNs
# are kept as explicitly stored entries -- so this matrix (and the .npz file)
# is probably not smaller than the dense form. Confirm with highest_S_sparse.nnz
# before relying on the sparse file for size savings.
highest_S_sparse = sparse.csr_matrix(highest_S)

In [27]:
# Save the sparse similarity matrix to file in SciPy's .npz format.
sparse.save_npz("Data/movie_similarity_matrix.npz", highest_S_sparse)

#### Sample code to read in the sparse matrix and re-inflate it

In [28]:
# Read the sparse similarity matrix back from the .npz file written above.
highest_S_sparse_read_back = sparse.load_npz("Data/movie_similarity_matrix.npz")

In [29]:
# Expand the sparse matrix back into a dense NumPy array.
highest_S_sparse_read_back_dense = highest_S_sparse_read_back.toarray()

In [30]:
# Reload the saved movie IDs and turn the "movie_ID" column into a plain list,
# in file order, for labelling the re-inflated matrix.
movie_id_table_read_back = pd.read_csv("Data/movie_IDs.csv")
movie_IDs_read_back = movie_id_table_read_back["movie_ID"].tolist()

In [31]:
# Rebuild the labelled dataframe from the dense array, using the reloaded
# movie IDs for both the row index and the column names.
highest_S_read_back_df = pd.DataFrame(
    highest_S_sparse_read_back_dense,
    index=movie_IDs_read_back,
    columns=movie_IDs_read_back,
)

In [32]:
# Inspect the dataframe rebuilt from the sparse file.
highest_S_read_back_df

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,,,,,
m996,,,,,,,,,,,...,,,,,,,,,,
m997,,,,,,,,,,,...,,,,,,,,,,
m998,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# Check that the re-inflated dataframe is the same as the original.
# DataFrame.compare reports only the cells that differ, so an empty result
# means the round trip through the .npz file preserved every value.
highest_S_read_back_df.compare(highest_S_df)