# Movie Rating

## Task1 : Map Reduce (movie_id, unique_users)

In [1]:
import pandas as pd

# Step 1: Load the Dataset
ratings_df = pd.read_csv('./data/ratings.csv')

# Display the first few rows of the DataFrame to verify loading
print(ratings_df.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [2]:

# Step 2: Map Phase
# For Map phase, we extract relevant columns: 'movieId' and 'userId'
map_df = ratings_df[['movieId', 'userId']]

# Display the first few rows of the map_df to verify extraction
print(map_df.head())


   movieId  userId
0        1       1
1        3       1
2        6       1
3       47       1
4       50       1


In [3]:

# Step 3: Reduce Phase
# Group by 'movieId' and count unique 'userId'
reduce_df = map_df.groupby('movieId')['userId'].nunique().reset_index()

# Rename columns to match the desired output
reduce_df.columns = ['movieId', 'unique_users']

# Display the first few rows of the reduce_df to verify results
print(reduce_df.head())


   movieId  unique_users
0        1           215
1        2           110
2        3            52
3        4             7
4        5            49


In [4]:

# Step 4: Output the Results
# Save the results to a CSV file
reduce_df.to_csv('./result/task_1_unique_users_per_movie.csv', index=False)

# Optionally, display the entire result
print(reduce_df)


      movieId  unique_users
0           1           215
1           2           110
2           3            52
3           4             7
4           5            49
...       ...           ...
9719   193581             1
9720   193583             1
9721   193585             1
9722   193587             1
9723   193609             1

[9724 rows x 2 columns]


## Task2 : MapReduce  top-rated movies (movie_id, average_rating)

In [5]:
import pandas as pd

# Step 1: Load the Dataset
ratings_df = pd.read_csv('./data/ratings.csv')

# Display the first few rows of the DataFrame to verify loading
print(ratings_df.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [6]:

# Step 2: Map Phase
# For Map phase, we extract relevant columns: 'movieId' and 'rating'
map_df = ratings_df[['movieId', 'rating']]

# Display the first few rows of the map_df to verify extraction
print(map_df.head())


   movieId  rating
0        1     4.0
1        3     4.0
2        6     4.0
3       47     5.0
4       50     5.0


In [7]:

# Step 3: Reduce Phase
# Group by 'movieId' and calculate the average rating
reduce_df = map_df.groupby('movieId')['rating'].mean().reset_index()

# Rename columns to match the desired output
reduce_df.columns = ['movieId', 'average_rating']


In [8]:

# Step 4: Sort and Output the Results
# Sort by 'average_rating' in descending order
sorted_df = reduce_df.sort_values(by='average_rating', ascending=False)

# Display the first few rows of the sorted_df to verify results
print(sorted_df.head())


      movieId  average_rating
7638    88448             5.0
8089   100556             5.0
9065   143031             5.0
9076   143511             5.0
9078   143559             5.0


In [9]:

# Save the results to a CSV file
sorted_df.to_csv('./result/task_2_top_rated_movies.csv', index=False)

# Optionally, display the entire result
print(sorted_df)


      movieId  average_rating
7638    88448             5.0
8089   100556             5.0
9065   143031             5.0
9076   143511             5.0
9078   143559             5.0
...       ...             ...
9253   157172             0.5
7536    85334             0.5
6486    53453             0.5
5200     8494             0.5
7145    71810             0.5

[9724 rows x 2 columns]


## Task3: similar Movie MinHash Algorithm

In [10]:
! pip install datasketch



In [11]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH


In [12]:

# Step 1: Load the Dataset
ratings_df = pd.read_csv('./data/ratings.csv')

# Display the first few rows of the DataFrame to verify loading
print(ratings_df.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [13]:

# Step 2: Create User-Movie Sets
# Create a dictionary where key is movieId and value is the set of userIds who rated the movie
movie_user_dict = ratings_df.groupby('movieId')['userId'].apply(set).to_dict()


In [14]:

# Step 3: Generate MinHash Signatures
# Define the number of permutations
num_permutations = 128

# Create a dictionary to store MinHash signatures for each movie
minhash_dict = {}

for movie_id, users in movie_user_dict.items():
    m = MinHash(num_perm=num_permutations)
    for user in users:
        m.update(str(user).encode('utf8'))
    minhash_dict[movie_id] = m


In [15]:

# Step 4: Create MinHash LSH
# Define LSH with an appropriate threshold
lsh = MinHashLSH(threshold=0.5, num_perm=num_permutations)

# Add MinHash signatures to LSH
for movie_id, m in minhash_dict.items():
    lsh.insert(movie_id, m)


In [16]:

# Step 5: Find Similar Movies
def find_similar_movies(movie_id, num_results=10):
    if movie_id not in minhash_dict:
        print(f"Movie ID {movie_id} not found in dataset.")
        return []
    
    result = lsh.query(minhash_dict[movie_id])
    result.remove(movie_id)  # Remove the given movie itself from the results
    return result[:num_results]

# Example: Find movies similar to a given movie ID
given_movie_id = 1  # Replace with the movieId you are interested in
similar_movies = find_similar_movies(given_movie_id, num_results=10)

# Print similar movies
print(f"Movies similar to movie ID {given_movie_id}: {similar_movies}")


Movies similar to movie ID 1: [260, 5349, 356, 296, 3114, 1291, 1196, 780, 590, 589]


## Task4 : Find closest

In [17]:
import pandas as pd
from datasketch import MinHash, MinHashLSH


In [18]:

# Step 1: Load the Dataset
ratings_df = pd.read_csv('./data/ratings.csv')

# Display the first few rows of the DataFrame to verify loading
print(ratings_df.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [19]:

# Step 2: Create User-Movie Sets
# Create a dictionary where key is movieId and value is the set of userIds who rated the movie
movie_user_dict = ratings_df.groupby('movieId')['userId'].apply(set).to_dict()



In [20]:
# Step 3: Generate MinHash Signatures
# Define the number of permutations
num_permutations = 128

# Create a dictionary to store MinHash signatures for each movie
minhash_dict = {}

for movie_id, users in movie_user_dict.items():
    m = MinHash(num_perm=num_permutations)
    for user in users:
        m.update(str(user).encode('utf8'))
    minhash_dict[movie_id] = m


In [21]:

# Step 4: Create MinHash LSH
# Define LSH with an appropriate threshold
lsh = MinHashLSH(threshold=0.5, num_perm=num_permutations)

# Add MinHash signatures to LSH
for movie_id, m in minhash_dict.items():
    lsh.insert(movie_id, m)


In [22]:

# Step 5: Find Similar Movies for Multiple Movies
def find_similar_movies(movie_ids, num_results=10):
    similar_movies_dict = {}
    for movie_id in movie_ids:
        if movie_id not in minhash_dict:
            print(f"Movie ID {movie_id} not found in dataset.")
            continue
        result = lsh.query(minhash_dict[movie_id])
        result.remove(movie_id)  # Remove the given movie itself from the results
        similar_movies_dict[movie_id] = result[:num_results]
    return similar_movies_dict

# Example: Find movies similar to five arbitrary movie IDs
given_movie_ids = [1, 2, 3, 4, 5]  # Replace with the movieIds you are interested in
similar_movies = find_similar_movies(given_movie_ids, num_results=10)

# Print similar movies
for movie_id, similar in similar_movies.items():
    print(f"Movies similar to movie ID {movie_id}: {similar}")


Movies similar to movie ID 1: [260, 5349, 356, 296, 3114, 1291, 1196, 780, 590, 589]
Movies similar to movie ID 2: [3489, 610, 420, 6, 2054, 364, 3052, 367, 344]
Movies similar to movie ID 3: []
Movies similar to movie ID 4: []
Movies similar to movie ID 5: [637, 3988, 1367]
