In [1]:
# Import all the required libraries
import numpy as np
import pandas as pd

## Question 1

Create a ratings matrix using Numpy. This matrix allows us to see the ratings for a given movie and user ID. The element at location $[i,j]$ is a rating given by user $i$ for movie $j$. Print the **shape** of the matrix produced.  

Additionally, choose 3 users that have rated the movie with MovieID "**1377**" (Batman Returns). Print these ratings, they will be used later for comparison.


**Notes:**
- Do *not* use `pivot_table`.
- A ratings matrix is *not* the same as `ratings_data` from above.
- The ratings of movie with MovieID $i$ are stored in the ($i$-1)th column (index starts from 0)  
- Not every user has rated every movie. Missing entries should be set to 0 for now.
- If you're stuck, you might want to look into `np.zeros` and how to use it to create a matrix of the desired shape.
- Every review lies between 1 and 5, and thus fits within a `uint8` datatype, which you can specify to numpy.

In [2]:
# Load the three MovieLens tables. The .dat files are '::'-delimited and have
# no header row, so the column names are supplied explicitly. A multi-character
# separator requires the slower python parsing engine.
column_list_ratings = ["UserID", "MovieID", "Ratings", "Timestamp"]
ratings_data = pd.read_csv('ratings.dat', sep='::', names=column_list_ratings, engine='python')
column_list_movies = ["MovieID", "Title", "Genres"]
movies_data = pd.read_csv('movies.dat', sep='::', names=column_list_movies, engine='python', encoding='latin-1')
# The last column was previously misspelled as "Zixp-code".
column_list_users = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
user_data = pd.read_csv("users.dat", sep="::", names=column_list_users, engine='python')

# One row per rating, enriched with the rater's profile and the movie's metadata.
# pd.merge joins on the columns the frames share (UserID, then MovieID).
data = pd.merge(pd.merge(ratings_data, user_data), movies_data)

# Mean rating per title and the 15 best-rated titles.
mean_ratings = data.pivot_table('Ratings', index=["Title"], aggfunc='mean')
top_15_mean_ratings = mean_ratings.sort_values(by='Ratings', ascending=False).head(15)

# Mean rating per title split by gender (columns 'F' and 'M').
mean_ratings = data.pivot_table('Ratings', index=["Title"], columns=["Gender"], aggfunc='mean')
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
print(top_female_ratings.head(15))

top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
print(top_male_ratings.head(15))

# Positive 'diff' means men rated the title higher than women did.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')

# Number of ratings per title; the final expression is what the cell displays.
ratings_by_title = data.groupby('Title').size()
ratings_by_title.sort_values(ascending=False).head(10)

Gender                                               F         M
Title                                                           
Clean Slate (Coup de Torchon) (1981)               5.0  3.857143
Ballad of Narayama, The (Narayama Bushiko) (1958)  5.0  3.428571
Raw Deal (1948)                                    5.0  3.307692
Bittersweet Motel (2000)                           5.0       NaN
Skipped Parts (2000)                               5.0  4.000000
Lamerica (1994)                                    5.0  4.666667
Gambler, The (A Játékos) (1997)                    5.0  3.166667
Brother, Can You Spare a Dime? (1975)              5.0  3.642857
Ayn Rand: A Sense of Life (1997)                   5.0  4.000000
24 7: Twenty Four Seven (1997)                     5.0  3.750000
Twice Upon a Yesterday (1998)                      5.0  3.222222
Woman of Paris, A (1923)                           5.0  2.428571
I Am Cuba (Soy Cuba/Ya Kuba) (1964)                5.0  4.750000
Gate of Heavenly Peace, T

Title
American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
Saving Private Ryan (1998)                               2653
Terminator 2: Judgment Day (1991)                        2649
Matrix, The (1999)                                       2590
Back to the Future (1985)                                2583
Silence of the Lambs, The (1991)                         2578
dtype: int64

In [3]:
# Create the matrix
import numpy as np
import pandas as pd



# The matrix is sized by the largest IDs (not by the number of distinct IDs)
# because the rating of UserID u for MovieID m is stored at [u - 1, m - 1].
num_users = ratings_data['UserID'].max()
num_movies = ratings_data['MovieID'].max()

# 0 marks "no rating"; real ratings fit in uint8.
ratings_matrix = np.zeros((num_users, num_movies), dtype=np.uint8)

# Fill every rating in one vectorised indexed assignment instead of looping
# over the rows with iterrows(), which is very slow on a table of this size.
# NOTE(review): assumes each (UserID, MovieID) pair occurs at most once; with
# duplicates NumPy does not guarantee which value ends up in the cell.
user_rows = ratings_data['UserID'].to_numpy() - 1
movie_cols = ratings_data['MovieID'].to_numpy() - 1
ratings_matrix[user_rows, movie_cols] = ratings_data['Ratings'].to_numpy()




In [4]:
# Print the shape of the ratings matrix as (rows, columns); rows are indexed by
# UserID - 1 and columns by MovieID - 1.
print(ratings_matrix.shape)

(6040, 3952)


In [5]:
# Store and print ratings for Batman Returns

def movieRatingSearch(ratings_matrix, movie_index, num_users=3):
    """Print and return the first users that rated a given movie.

    Parameters
    ----------
    ratings_matrix : 2-D array
        Ratings indexed as [UserID - 1, MovieID - 1]; 0 means "not rated".
    movie_index : int
        Zero-based column of the movie (MovieID - 1).
    num_users : int, optional
        How many rating users to report (default 3).

    Returns
    -------
    numpy.ndarray
        Zero-based row indices (UserID - 1) of the reported users, in
        ascending order.
    """
    movie_ratings = ratings_matrix[:, movie_index]

    # Rows holding a non-zero entry are the users that rated this movie.
    user_ids = np.where(movie_ratings > 0)[0][:num_users]

    for user_id in user_ids:
        rating = movie_ratings[user_id]
        # Report 1-based IDs, derived from the indices rather than hard-coded.
        print(f"User {user_id + 1} rated the movie with MovieID '{movie_index + 1}' a rating of {rating}")

    return user_ids

# Keep the selected row indices: the rank-k comparison in Question 4 reads `user_ids`.
user_ids = movieRatingSearch(ratings_matrix, 1376)

User 10 rated the movie with MovieID '1377' a rating of 3
User 13 rated the movie with MovieID '1377' a rating of 3
User 18 rated the movie with MovieID '1377' a rating of 2


## Question 2

Normalize the ratings matrix (created in **Question 1**) using Z-score normalization. While we can't use `sklearn`'s `StandardScaler` for this step, we can do the statistical calculations ourselves to normalize the data.

Before you start:
- Your first step should be to get the average of every *column* of the ratings matrix (we want an average by title, not by user!).
- Make sure that the mean is calculated considering only non-zero elements. If there is a movie which is rated only by 10 users, we get its mean rating using (sum of the 10 ratings)/10 and **NOT** (sum of 10 ratings)/(total number of users)
- All of the missing values in the dataset should be replaced with the average rating for the given movie. This is a complex topic, but for our case replacing empty values with the mean will make it so that the absence of a rating doesn't affect the overall average, and it provides an "expected value" which is useful for computing correlations and recommendations in later steps.
- In our matrix, 0 represents a missing rating.
- Next, we want to subtract the average from the original ratings thus allowing us to get a mean of 0 in every *column*. It may be very close but not exactly zero because of the limited precision `float`s allow.
- Lastly, divide this by the standard deviation of the *column*.

- Not every MovieID is used, leading to zero columns. This will cause a divide by zero error when normalizing the matrix. Simply replace any NaN values in your normalized matrix with 0.

In [11]:
def matrix_normalized(ratings_matrix):
    """Return the column-wise (per-movie) z-score normalisation of a ratings matrix.

    Zeros are treated as missing ratings. Each column's mean is computed over
    its non-zero entries only, missing entries are replaced by that mean, and
    the column is then centred and divided by its standard deviation.

    Columns with no ratings, or with zero standard deviation, come out as all
    zeros instead of NaN/inf.

    Parameters
    ----------
    ratings_matrix : 2-D array
        Ratings with 0 meaning "missing". The input is not modified.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as the input.
    """
    # Work in float64. Filling the means into a uint8 copy would truncate them
    # (e.g. 3.58 -> 3), so missing entries would no longer normalise to 0.
    ratings = np.asarray(ratings_matrix, dtype=np.float64)
    rated = ratings != 0

    count_nonzero = rated.sum(axis=0)
    # Avoid 0/0 for never-rated movies; their sum is 0, so their mean stays 0.
    safe_counts = np.where(count_nonzero == 0, 1, count_nonzero)
    mean_ratings = ratings.sum(axis=0) / safe_counts

    # Replace missing ratings by the movie's mean (broadcast along the rows).
    normalized_matrix = np.where(rated, ratings, mean_ratings)

    std_dev = normalized_matrix.std(axis=0)
    std_dev[std_dev == 0] = 1  # constant columns: keep them at 0 after centring

    # Centre and scale in place to avoid extra full-size temporaries.
    normalized_matrix -= mean_ratings
    normalized_matrix /= std_dev
    return np.nan_to_num(normalized_matrix, copy=False)

## Question 3

We're now going to perform Singular Value Decomposition (SVD) on the normalized ratings matrix from the previous question. Perform the process using numpy, and along the way print the shapes of the $U$, $S$, and $V$ matrices you calculated.

In [12]:
# Compute the SVD of the normalised matrix
import numpy as np

# The normalised matrix is not kept in a variable by the previous cell, so it
# is rebuilt inline in the SVD call below.

# Mean rating per title (re-assigns mean_ratings; not used by the SVD itself).
mean_ratings = data.pivot_table(values='Ratings', index='Title', aggfunc='mean')

# Reduced SVD: with full_matrices=False, S is returned as a 1-D vector of
# singular values and U / Vt are truncated to min(n_rows, n_cols) components.
U, S, Vt = np.linalg.svd(
    matrix_normalized(ratings_matrix),
    full_matrices=False,
)

# Report the shape of each factor.
print('\n'.join(
    f'Shape of {factor_name}: {factor.shape}'
    for factor_name, factor in (('U', U), ('S', S), ('Vt', Vt))
))


Shape of U: (6040, 3952)
Shape of S: (3952,)
Shape of Vt: (3952, 3952)


## Question 4

Reconstruct four rank-k rating matrices $R_k$, where $R_k = U_kS_kV_k^T$, for k = [100, 1000, 2000, 3000]. Using each $R_k$, make predictions for the 3 users selected in Question 1, for the movie with ID 1377 (Batman Returns). Compare the original ratings with the predicted ratings.

In [None]:
import numpy as np

# This cell expects U, S and Vt from the SVD cell above, and a module-level
# `user_ids` array holding the zero-based row indices (UserID - 1) of the
# users picked in Question 1.

# np.linalg.svd returns the singular values as a 1-D vector; expand them into
# a square matrix with the values on the diagonal and zeros elsewhere.
S_diag = np.diag(S)



def getRk(value, U_mat=None, S_vals=None, Vt_mat=None):
    """Build the rank-`value` reconstruction R_k = U_k S_k V_k^T.

    Parameters
    ----------
    value : int
        Number of leading singular components to keep.
    U_mat, S_vals, Vt_mat : array, optional
        SVD factors as returned by ``np.linalg.svd(..., full_matrices=False)``
        (``S_vals`` is the 1-D vector of singular values). Each one defaults to
        the module-level ``U``, ``S`` and ``Vt`` when omitted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(U_mat.shape[0], Vt_mat.shape[1])``.
    """
    U_mat = U if U_mat is None else U_mat
    S_vals = S if S_vals is None else S_vals
    Vt_mat = Vt if Vt_mat is None else Vt_mat

    U_v = U_mat[:, :value]
    # Multiplying by a diagonal matrix only scales the rows of Vt, so scale
    # them directly instead of paying for a dense (k x k) @ (k x n) product.
    scaled_Vt = S_vals[:value, np.newaxis] * Vt_mat[:value, :]
    return np.dot(U_v, scaled_Vt)
    
# Ranks at which the reconstruction is evaluated
k_values = [100, 1000, 2000, 3000]

# NOTE(review): R_k is rebuilt from the SVD of the z-score-normalised matrix,
# whereas ratings_matrix holds the raw ratings, so the two printed rows are on
# different scales - confirm this is the comparison that is wanted.
for k in k_values:
    # Rank-k approximation of the normalised ratings
    R_k = getRk(k)

    # Entries for the selected users (zero-based rows in user_ids) and
    # MovieID 1377, which lives in column 1376
    predicted_ratings = R_k[user_ids, 1376]
    print(f'Predicted ratings for k = {k}: {predicted_ratings}')

    # The same cells taken from the untouched ratings matrix
    original_ratings = ratings_matrix[user_ids, 1376]
    print(f'Original ratings: {original_ratings}')


## Question 5

### Cosine Similarity
Cosine similarity is a metric used to measure how similar two vectors are. Mathematically, it measures the cosine of the angle between two vectors projected in a multi-dimensional space. Cosine similarity is highest when the angle between two vectors is 0, and in general the output value lies within $cosine(x,y) \in [-1,1]$: $1$ (parallel) means that both items are 100% similar, $0$ means there is no similarity (perpendicular), and $-1$ means the vectors point in opposite directions. For vectors with only non-negative entries the range narrows to $[0,1]$.

$$ cosine(x,y) = \frac{x^T y}{||x|| ||y||}  $$

**Based on the reconstruction rank-1000 rating matrix $R_{1000}$ and the cosine similarity,** sort the movies which are most similar. You will have a function `top_movie_similarity` which sorts data by its similarity to a movie with ID `movie_id` and returns the top $n$ items, and a second function `print_similar_movies` which prints the titles of said similar movies. Return the top 5 movies for the movie with ID `1377` (*Batman Returns*)

Note: While finding the cosine similarity, there are a few empty columns which will have a magnitude of **zero** resulting in NaN values. These should be replaced by 0, otherwise these columns will show most similarity with the given movie. 

In [None]:
# Sort the movies based on cosine similarity
def top_movie_similarity(data, movie_id, top_n=5):
    """Return the columns most cosine-similar to the column of `movie_id`.

    Parameters
    ----------
    data : 2-D array
        Matrix whose column ``j`` describes the movie with MovieID ``j + 1``.
    movie_id : int
        1-based MovieID of the query movie.
    top_n : int, optional
        Number of similar movies to return (default 5). At most
        ``data.shape[1] - 1`` indices are returned.

    Returns
    -------
    numpy.ndarray
        Zero-based column indices (MovieID - 1), most similar first. The query
        movie itself is never included.
    """
    # Subtract 1 from movie_id since indices start from 0
    target_col = movie_id - 1
    movie_vector = data[:, target_col]

    # A small epsilon keeps all-zero columns from dividing by zero; their
    # similarity then comes out as 0.
    norms = np.linalg.norm(data, axis=0) + 1e-10
    movie_norm = np.linalg.norm(movie_vector) + 1e-10

    cosine_similarities = np.dot(data.T, movie_vector) / (norms * movie_norm)
    cosine_similarities = np.nan_to_num(cosine_similarities)

    # Exclude the query movie explicitly. Dropping the top-ranked entry is not
    # reliable: because of the epsilon, a parallel column with a larger norm
    # scores slightly higher than the movie does against itself.
    cosine_similarities[target_col] = -np.inf

    top_n = max(0, min(top_n, cosine_similarities.size - 1))
    ranked = np.argsort(cosine_similarities, kind='stable')[::-1]
    return ranked[:top_n]
def print_similar_movies(movie_titles, top_indices):
    """Print the MovieID and title for each zero-based movie index.

    Parameters
    ----------
    movie_titles : mapping
        Titles keyed by 1-based MovieID.
    top_indices : iterable of int
        Zero-based column indices (MovieID - 1), e.g. the result of
        ``top_movie_similarity``.
    """
    print('Most Similar movies:')
    for index in top_indices:
        # Titles are keyed by MovieID, which is one more than the column index.
        movie_id = index + 1
        try:
            title = movie_titles[movie_id]
        except (KeyError, IndexError):
            # Not every MovieID has an entry; report it instead of crashing.
            title = '<unknown title>'
        print(f'MovieID: {movie_id}, Title: {title}')




# Rank-1000 reconstruction on which the similarities are computed
R_1000 = getRk(1000)

# Title lookup keyed by MovieID
movie_titles = pd.Series(
    data=movies_data['Title'].values,
    index=movies_data['MovieID'],
).to_dict()

# Print the top 5 movies for Batman Returns
movie_id = 1377
top_indices = top_movie_similarity(R_1000, movie_id)
print_similar_movies(movie_titles, top_indices)

## Question 6

### Movie Recommendations
Using the same process from Question 5, write `top_user_similarity` which sorts data by its similarity to a user with ID `user_id` and returns the top result. Then find the MovieIDs of the movies that this similar user has rated most highly, but that `user_id` has not yet seen. Find at least 5 movie recommendations for the user with ID `5954` and print their titles.

Hint: To check your results, find the genres of the movies that the user likes and compare with the genres of the recommended movies.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Title lookup keyed by MovieID (the same expression also builds movie_titles
# in the Question 5 cell; repeating it here lets this cell define it itself).
movie_titles = pd.Series(movies_data['Title'].values, index=movies_data['MovieID']).to_dict()
def top_user_similarity(data, user_id):
    """Return the row index of the user most cosine-similar to `user_id`.

    Parameters
    ----------
    data : 2-D array
        Matrix whose row ``i`` describes the user with UserID ``i + 1``.
    user_id : int
        1-based UserID of the query user.

    Returns
    -------
    int
        Zero-based row index (UserID - 1) of the most similar other user.
    """
    # Subtract 1 from user_id as the matrix indices start from 0
    row = user_id - 1
    user_vector = data[row, :]

    # Cosine similarity of the query row against every row. Rows of all zeros
    # get a norm of 1 so their similarity is 0 rather than NaN.
    norms = np.linalg.norm(data, axis=1)
    safe_norms = np.where(norms == 0, 1.0, norms)
    similarity_scores = np.dot(data, user_vector) / (safe_norms * safe_norms[row])
    similarity_scores = np.nan_to_num(similarity_scores)

    # Exclude the user itself with -inf. A score of 0 would still win whenever
    # every other user has a negative similarity.
    similarity_scores[row] = -np.inf
    similar_user_index = np.argmax(similarity_scores)

    return similar_user_index

def get_recommendations(data, user_id, similar_user_index, top_n=5, raw_ratings=None):
    """Pick the similar user's best-scored movies that `user_id` has not rated.

    Parameters
    ----------
    data : 2-D array
        Score matrix (e.g. a rank-k reconstruction) indexed as
        [UserID - 1, MovieID - 1]; used to rank the candidate movies.
    user_id : int
        1-based UserID of the user receiving recommendations.
    similar_user_index : int
        Zero-based row index of the similar user.
    top_n : int, optional
        Maximum number of movies to return (default 5).
    raw_ratings : 2-D array, optional
        Original ratings matrix in which 0 means "not rated". When given, it
        decides which movies `user_id` has not seen, and candidates are limited
        to movies the similar user actually rated. When omitted, zeros in
        `data` are used to detect unseen movies (the previous behaviour).

    Returns
    -------
    numpy.ndarray
        Zero-based column indices (MovieID - 1), best first.
    """
    # A reconstructed/normalised matrix has almost no exact zeros, so "unseen"
    # has to be read from the raw ratings whenever they are available.
    seen_source = data if raw_ratings is None else raw_ratings
    unseen_indices = np.where(seen_source[user_id - 1, :] == 0)[0]

    if raw_ratings is not None:
        # Only recommend movies the similar user has really rated.
        rated_by_similar = raw_ratings[similar_user_index, unseen_indices] > 0
        unseen_indices = unseen_indices[rated_by_similar]

    candidate_scores = data[similar_user_index, unseen_indices]
    top_movie_indices = unseen_indices[np.argsort(candidate_scores)[::-1][:top_n]]
    return top_movie_indices

# R_1000 is the rank-1000 reconstruction and ratings_matrix the raw ratings.

def print_Recommended_movies(user_id, data, raw_ratings=None, top_n=5):
    """Print movie titles recommended to `user_id` from its most similar user.

    Parameters
    ----------
    user_id : int
        1-based UserID of the user receiving recommendations.
    data : 2-D array
        Score matrix used both to find the similar user and to rank movies.
    raw_ratings : 2-D array, optional
        Original ratings (0 = not rated); defaults to the module-level
        ``ratings_matrix``.
    top_n : int, optional
        Maximum number of titles to print (default 5).
    """
    if raw_ratings is None:
        raw_ratings = ratings_matrix

    # Use the matrix that was passed in rather than a hard-wired global.
    similar_user_index = top_user_similarity(data, user_id)
    top_movie_indices = get_recommendations(
        data, user_id, similar_user_index, top_n=top_n, raw_ratings=raw_ratings
    )

    # Print the recommended movie titles
    print('Recommended movies:')
    for movie_index in top_movie_indices:
        # Add 1 to movie_index as MovieID starts from 1
        recommended_id = movie_index + 1
        print(movie_titles.get(recommended_id, f'<unknown title for MovieID {recommended_id}>'))

# The question asks for recommendations for UserID 5954; set it explicitly
# instead of relying on a `user_id` left over from another cell.
user_id = 5954

# R_1000 was already built in the Question 5 cell, so it is reused here rather
# than recomputing the expensive rank-1000 reconstruction.
print_Recommended_movies(user_id, R_1000)


