In [195]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [196]:
# Read in data
ratings=pd.read_csv('C:\\Users\\DELL 5540\\Desktop\\ass\\ratings.csv')

# Take a look at the data
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [197]:
# Get the dataset information
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [198]:
# Number of users
print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 610 unique users
The ratings dataset has 9724 unique movies
The ratings dataset has 10 unique ratings
The unique ratings are [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


In [199]:
# Read in data
movies = pd.read_csv('C:\\Users\\DELL 5540\\Desktop\\ass\\movies.csv')

# Take a look at the data
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [200]:
# Merge ratings and movies datasets
df = pd.merge(ratings, movies, on='movieId', how='inner')

# Take a look at the data
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [201]:
# Aggregate by movie to calculate mean rating and number of ratings
agg_ratings = df.groupby('title').agg(mean_rating=('rating', 'mean'),
                                      number_of_ratings=('rating', 'count')).reset_index()

# Keep the movies with over 100 ratings
agg_ratings_GT100 = agg_ratings[agg_ratings['number_of_ratings'] > 100]
agg_ratings_GT100.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 74 to 9615
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              134 non-null    object 
 1   mean_rating        134 non-null    float64
 2   number_of_ratings  134 non-null    int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 4.2+ KB


In [202]:
# Check popular movies
print(agg_ratings_GT100.sort_values(by='number_of_ratings', ascending=False).head())

                                 title  mean_rating  number_of_ratings
3158               Forrest Gump (1994)     4.164134                329
7593  Shawshank Redemption, The (1994)     4.429022                317
6865               Pulp Fiction (1994)     4.197068                307
7680  Silence of the Lambs, The (1991)     4.161290                279
5512                Matrix, The (1999)     4.192446                278


In [203]:
# Merge data
df_GT100 = pd.merge(df, agg_ratings_GT100[['title']], on='title', how='inner')
df_GT100.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19788 entries, 0 to 19787
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     19788 non-null  int64  
 1   movieId    19788 non-null  int64  
 2   rating     19788 non-null  float64
 3   timestamp  19788 non-null  int64  
 4   title      19788 non-null  object 
 5   genres     19788 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 927.7+ KB


In [204]:
# Number of users
print('The ratings dataset has', df_GT100['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', df_GT100['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', df_GT100['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(df_GT100['rating'].unique()))

The ratings dataset has 597 unique users
The ratings dataset has 134 unique movies
The ratings dataset has 10 unique ratings
The unique ratings are [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


In [205]:
# Create user-item matrix
matrix = df_GT100.pivot_table(index='userId', columns='title', values='rating')

# Fill NaN values with zero
matrix = matrix.fillna(0)

# Display the first few rows of the updated matrix
matrix.head()


title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,5.0,5.0,0.0,4.0,...,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,5.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [206]:
# Convert the matrix values from float to integer
matrix = matrix.astype(int)

# Display the first few rows of the updated matrix
matrix.head()


title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,4,0,0,5,5,0,4,...,0,0,0,3,0,5,0,0,5,5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,4,0,0,0,5,0,0,0,...,0,0,2,0,0,0,0,0,4,0
5,0,3,4,0,0,0,0,0,0,0,...,2,0,0,0,0,4,0,0,0,0


In [207]:
# Normalize user-item matrix
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis='rows')

matrix_norm.head()

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.835821,-1.835821,-1.835821,2.164179,-1.835821,-1.835821,3.164179,3.164179,-1.835821,2.164179,...,-1.835821,-1.835821,-1.835821,1.164179,-1.835821,3.164179,-1.835821,-1.835821,3.164179,3.164179
2,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,...,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493,-0.201493
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.858209,-0.858209,3.141791,-0.858209,-0.858209,-0.858209,4.141791,-0.858209,-0.858209,-0.858209,...,-0.858209,-0.858209,1.141791,-0.858209,-0.858209,-0.858209,-0.858209,-0.858209,3.141791,-0.858209
5,-0.671642,2.328358,3.328358,-0.671642,-0.671642,-0.671642,-0.671642,-0.671642,-0.671642,-0.671642,...,1.328358,-0.671642,-0.671642,-0.671642,-0.671642,3.328358,-0.671642,-0.671642,-0.671642,-0.671642


In [208]:
# User similarity matrix using Pearson correlation
user_similarity = matrix_norm.T.corr()

# Fill NaN values with zero
user_similarity = user_similarity.fillna(0)

user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.11531,0.0,0.300366,0.065591,-0.02201,0.014485,-0.034252,0.036715,-0.171402,...,-0.10834,0.067758,0.357561,-0.15302,0.01108,0.139821,0.300586,0.148472,-0.004133,0.166543
2,-0.11531,1.0,0.0,-0.102366,-0.055553,-0.103067,-0.119415,-0.057469,-0.050342,0.157658,...,0.226987,-0.101786,-0.114512,-0.11952,-0.126276,-0.014809,-0.106838,-0.134661,-0.014861,0.067283
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.300366,-0.102366,0.0,1.0,-0.000772,-0.061323,0.069278,-0.111057,-0.042833,-0.096604,...,-0.075363,-0.027989,0.323436,-0.093433,0.052885,-0.007797,0.116084,0.118515,-0.135418,0.093889
5,0.065591,-0.055553,0.0,-0.000772,1.0,0.510299,-0.050868,0.51305,-0.101608,-0.028335,...,-0.093165,0.50746,-0.024537,0.377905,0.23394,-0.179117,0.151878,-0.009341,0.359096,-0.188422


In [209]:
# User similarity matrix using cosine similarity
user_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
user_similarity_cosine

array([[ 1.        , -0.11531035,  0.        , ...,  0.14847163,
        -0.00413316,  0.16654301],
       [-0.11531035,  1.        ,  0.        , ..., -0.13466097,
        -0.01486064,  0.06728279],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.14847163, -0.13466097,  0.        , ...,  1.        ,
        -0.06990143,  0.26522226],
       [-0.00413316, -0.01486064,  0.        , ..., -0.06990143,
         1.        , -0.20399043],
       [ 0.16654301,  0.06728279,  0.        , ...,  0.26522226,
        -0.20399043,  1.        ]])

In [210]:
# Pick a user ID
picked_userid = 1

# Remove picked user ID from the candidate list
user_similarity.drop(index=picked_userid, inplace=True)

# Take a look at the data
user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.11531,1.0,0.0,-0.102366,-0.055553,-0.103067,-0.119415,-0.057469,-0.050342,0.157658,...,0.226987,-0.101786,-0.114512,-0.11952,-0.126276,-0.014809,-0.106838,-0.134661,-0.014861,0.067283
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.300366,-0.102366,0.0,1.0,-0.000772,-0.061323,0.069278,-0.111057,-0.042833,-0.096604,...,-0.075363,-0.027989,0.323436,-0.093433,0.052885,-0.007797,0.116084,0.118515,-0.135418,0.093889
5,0.065591,-0.055553,0.0,-0.000772,1.0,0.510299,-0.050868,0.51305,-0.101608,-0.028335,...,-0.093165,0.50746,-0.024537,0.377905,0.23394,-0.179117,0.151878,-0.009341,0.359096,-0.188422
6,-0.02201,-0.103067,0.0,-0.061323,0.510299,1.0,-0.089225,0.647324,-0.156126,-0.090694,...,-0.329619,0.744929,-0.225232,0.483299,0.149067,-0.406263,0.15799,-0.227254,0.487803,-0.415132


In [211]:
# Number of similar users
n = 10

# User similarity threashold
user_similarity_threshold = 0.3

# Get top n similar users
similar_users = user_similarity[user_similarity[picked_userid]>user_similarity_threshold][picked_userid].sort_values(ascending=False)[:n]

# Print out top n similar users
print(f'The similar users for user {picked_userid} are', similar_users)

The similar users for user 1 are userId
266    0.531156
469    0.472864
555    0.440040
452    0.431003
313    0.416069
217    0.409116
577    0.403548
368    0.397934
202    0.389325
437    0.383807
Name: 1, dtype: float64


In [212]:
# Number of similar users to display
num_similar_users = 5
num_items = 5

# Sort the similarity values of user 1 in descending order, excluding user 1
similar_users = user_similarity[1].sort_values(ascending=False).index[1:num_similar_users+1]

# Filter the matrix to include only the similar users and the first num_items items (columns)
similar_users_matrix = matrix.loc[similar_users, :].iloc[:, :num_items]

# Print the user-item matrix of similar users to user 1
print(f'The user-item matrix for the top {num_similar_users} similar users to user 1 with {num_items} items is:')
similar_users_matrix.head()


The user-item matrix for the top 5 similar users to user 1 with 5 items is:


title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
469,5,0,2,5,4
555,0,4,0,5,5
452,3,0,0,4,5
313,3,0,0,4,5
217,0,3,0,3,3


In [213]:
# Normalize user-item matrix
similar_users_matrix_norm = similar_users_matrix.subtract(similar_users_matrix.mean(axis=1), axis='rows')

similar_users_matrix_norm.head()

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
469,1.8,-3.2,-1.2,1.8,0.8
555,-2.8,1.2,-2.8,2.2,2.2
452,0.6,-2.4,-2.4,1.6,2.6
313,0.6,-2.4,-2.4,1.6,2.6
217,-1.8,1.2,-1.8,1.2,1.2


In [214]:
# User similarity matrix using Pearson correlation
user_similarity = similar_users_matrix_norm.T.corr()
user_similarity.head()

userId,469,555,452,313,217
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
469,1.0,0.00891,0.831498,0.831498,-0.126323
555,0.00891,1.0,0.478266,0.478266,0.987484
452,0.831498,0.478266,1.0,1.0,0.356873
313,0.831498,0.478266,1.0,1.0,0.356873
217,-0.126323,0.987484,0.356873,0.356873,1.0


In [215]:
# User similarity matrix using cosine similarity
user_similarity_cosine = cosine_similarity(similar_users_matrix_norm.fillna(0))
user_similarity_cosine

array([[ 1.        ,  0.00891012,  0.83149805,  0.83149805, -0.12632279],
       [ 0.00891012,  1.        ,  0.47826602,  0.47826602,  0.98748386],
       [ 0.83149805,  0.47826602,  1.        ,  1.        ,  0.35687321],
       [ 0.83149805,  0.47826602,  1.        ,  1.        ,  0.35687321],
       [-0.12632279,  0.98748386,  0.35687321,  0.35687321,  1.        ]])