In [2]:
import pandas as pd
import numpy as np
from scipy.linalg import svd as SVD
from sklearn.metrics.pairwise import cosine_similarity

1.Loading movies and rating data


In [3]:
#load movie and ratings data 
read_opts = dict(
    sep='::',            # separator
    engine='python',     # required when the separator is a regex
    header=None,         # no header row
    encoding='ISO‑8859‑1'  # avoids UnicodeDecodeError
)
movies = pd.read_csv('movies.dat',names=['MovieID', 'Title', 'Genres'],**read_opts)
ratings = pd.read_csv('ratings.dat',names=['UserID', 'MovieID', 'Rating', 'Timestamp'],**read_opts)

In [4]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
print(f"\nLoaded {len(movies)} movies from movie.dat and {len(ratings)} ratings from ratings.dat.")


Loaded 3883 movies from movie.dat and 1000209 ratings from ratings.dat.


2.Normalized matrix of ratings data with movies as row and users as column.


 1)Movie‑by‑user matrix

- Rows = MovieID
- Columns = UserID
- Cells = Rating (1‑5)

In [7]:
rating_matrix = ratings.pivot_table(
    index='MovieID',  #rows
    columns='UserID', #columns 
    values= 'Rating',   #cells,
    fill_value = 0     #fill missing values with 0
)

In [8]:
rating_matrix

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3706 rows × 6040 columns => number of movies rated  3706 , number of users 6040

Normalizing the rating matrix  (Movie‑by‑user matrix)

For every movie, subtract its minimum rating and divide by its rating range.
- Min-Max norm(rating)= (Rating - Min) / (Max - Min)  
- Cells now lie in [0, 1].
- Missing ratings remain NaN (if not filled by 0 ) 
- Movies where max == min (only one distinct rating) become all‑NaN after scaling.
- All NA  -> we can fill by 0 with .fillna(0)

In [9]:
row_min = rating_matrix.min(axis=1)
row_max = rating_matrix.max(axis=1)
denom   = (row_max - row_min).replace(0, pd.NA) # replace 0 with NA to avoid division by zero

In [10]:
normalized_rating_matrix = rating_matrix.sub(row_min, axis=0).div(denom, axis=0).fillna(0)

normalized_rating_matrix

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.8,0.0,0.8,1.0,1.0,...,0.0,0.8,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.6
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.4,0.4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
print("Normalized matrix shape(movies, users):", normalized_rating_matrix.shape) 

Normalized matrix shape(movies, users): (3706, 6040)


3. Perform SVD to get U, S and V.

In [12]:
U, S, V = SVD(normalized_rating_matrix)

In [13]:
print("Sigma values shape :", S.shape)  #(k,)	Singular values (importance of each latent feature)
print("U matrix shape     :", U.shape)      #(movies × k) Movie features
print("V matrix shape    :", V.shape)     #(k × users)  User features

Sigma values shape : (3706,)
U matrix shape     : (3706, 3706)
V matrix shape    : (6040, 6040)


In [14]:
print("Sigma values :", S)  #(k, ) Singular values (importance of each latent feature)
print("U matrix      :", U) #(movies × k) Movie features
print("Vt matrix     :", V.T) #(k × users)  User features

Sigma values : [3.78676293e+02 1.34306978e+02 1.14986937e+02 ... 2.19396745e-14
 2.11364834e-14 9.08591563e-15]
U matrix      : [[ 7.01292699e-02 -2.08718511e-02  3.01703816e-02 ...  4.91323006e-17
   2.15096461e-19  1.26168754e-17]
 [ 2.35437422e-02 -2.98019623e-02 -1.01811025e-02 ... -3.70308992e-16
   1.47964368e-16  3.13948017e-17]
 [ 1.37663410e-02 -1.67172612e-02  1.25767352e-02 ...  5.47363776e-17
  -1.19207022e-17  7.47693166e-17]
 ...
 [ 2.61529685e-03  1.87240471e-03  1.78354990e-03 ... -4.36251812e-16
  -9.08141131e-17 -3.85183621e-17]
 [ 1.16633846e-03  2.26460117e-03  3.52141712e-03 ...  3.40488886e-16
   6.45993395e-17 -9.88361844e-17]
 [ 1.32557023e-02  5.02686103e-03  2.23587098e-02 ... -8.79238612e-18
   2.15534488e-16 -2.59054735e-17]]
Vt matrix     : [[ 4.71691270e-03  1.65196719e-03  2.67178785e-03 ... -1.58756032e-03
  -1.25134593e-02 -2.41261255e-02]
 [ 9.28820828e-03 -2.68602619e-03  3.91202759e-04 ...  1.31205017e-02
  -5.77185348e-03 -1.51057664e-03]
 [ 5.00922

U: left singular vectors → eigenvectors of AA𝑇

V: right singular vectors → eigenvectors of A𝑇A

Σ: square roots of the nonzero eigenvalues


Singular Value Decomposition of a matrix A:     A=UΣV^T 


In [15]:
eigenvalues = S**2

4. Select top 25 components from S. 

In [16]:
top_25_singular_value = np.diag(S[0:25])
top_25_eigenvectors_movies = U[:,:25]     # shape: (movies, 25)
top_25_eigenvectors_users  = V[:,:25]     # shape: (users, 25)

In [17]:
print(top_25_singular_value.shape)
print(top_25_eigenvectors_movies.shape)
print(top_25_eigenvectors_users.shape)

(25, 25)
(3706, 25)
(6040, 25)


In [18]:
print("Top 25 Singular matrix (sigma):")
print(top_25_singular_value)

Top 25 Singular matrix (sigma):
[[378.67629285   0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.         134.30697825   0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.         114.98693683   0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.         103.65940705   

5. Get the top 25 eigenvectors using eigenvalues.

In [19]:
print("Top 25 eigenvectors (movie space):")
print(top_25_eigenvectors_movies)

print("Top 25 eigenvectors (user space):")
print(top_25_eigenvectors_users)


Top 25 eigenvectors (movie space):
[[ 0.07012927 -0.02087185  0.03017038 ... -0.03191834  0.00973113
  -0.05263558]
 [ 0.02354374 -0.02980196 -0.0101811  ...  0.09272234 -0.00405696
   0.00425508]
 [ 0.01376634 -0.01671726  0.01257674 ...  0.00958803  0.0087551
   0.02656059]
 ...
 [ 0.0026153   0.0018724   0.00178355 ...  0.00226584 -0.00290046
  -0.00748729]
 [ 0.00116634  0.0022646   0.00352142 ...  0.00498655  0.00099356
  -0.00813316]
 [ 0.0132557   0.00502686  0.02235871 ...  0.01458245  0.02383491
  -0.01084252]]
Top 25 eigenvectors (user space):
[[ 0.00471691  0.00928821  0.00500922 ...  0.01767976  0.01191866
   0.00668732]
 [ 0.00165197 -0.00268603 -0.0033328  ... -0.00689869  0.00648275
  -0.01322101]
 [ 0.00267179  0.0003912  -0.00334102 ... -0.01612847  0.00589884
  -0.00522083]
 ...
 [-0.00158756  0.0131205  -0.0061135  ...  0.00228807 -0.01328452
  -0.01106998]
 [-0.01251346 -0.00577185 -0.01001015 ... -0.00937424 -0.01573741
  -0.00214891]
 [-0.02412613 -0.00151058  0.0

In [20]:
movies_25d = np.dot(top_25_eigenvectors_movies, top_25_singular_value)

movies_25d #(num of movies, 25)

array([[26.55629194, -2.80323525,  3.46919976, ..., -1.27662229,
         0.38336613, -2.02571879],
       [ 8.91545703, -4.0026115 , -1.17069379, ...,  3.70857053,
        -0.15982742,  0.1637598 ],
       [ 5.21298699, -2.24524484,  1.44616026, ...,  0.3834879 ,
         0.34491467,  1.02220361],
       ...,
       [ 0.99035092,  0.25147702,  0.20508494, ...,  0.09062589,
        -0.11426588, -0.28815369],
       [ 0.44166472,  0.30415174,  0.40491697, ...,  0.1994445 ,
         0.03914214, -0.3130105 ],
       [ 5.01962019,  0.67514252,  2.57095955, ...,  0.58324719,
         0.9389966 , -0.41728228]])

Using cosine similarity, find 5 closest movies of the movie ID 2025 using the 25
components from SVD. 

cosine similarity = dot/norm = U.V/||u||.||v||

In [21]:
target_movie_id = 2025  # given
movie_vector = movies_25d[target_movie_id, :]
movie_vector

array([ 0.19371195,  0.13159014, -0.05524148,  0.16846071, -0.14996159,
       -0.02363218,  0.20828024, -0.06771647,  0.01249884,  0.0944314 ,
        0.01147624,  0.01863196,  0.01274176, -0.09550632, -0.02964077,
        0.11757068, -0.00910323, -0.10520358,  0.18037412,  0.00126461,
       -0.16155034,  0.09745479, -0.03365893, -0.17330761, -0.03055152])

6. Using cosine similarity, find 5 closest movies of the movie ID 2025 using the 25
components from SVD.

In [22]:

# Find cosine similarity between movie 2025 and all movies
similarities = cosine_similarity(movie_vector.reshape(1, -1), movies_25d)

# Flatten similarity 
similarities = similarities.flatten()

# Set self-similarity to very small so it doesn’t get picked
similarities[target_movie_id] = -1


In [23]:
# Get indices of top 5 movies
top5_indices = similarities.argsort()[-5:][::-1]

print("Top 5 similar movies to movie 2025:", top5_indices)


Top 5 similar movies to movie 2025: [2037 3289  901 3574 3284]


In [24]:
top5_movie_ids = top5_indices

In [25]:
# Build the output DataFrame
output = pd.DataFrame({
    'Index': range(1, 6),
    'MovieID': top5_movie_ids,
    'Title': movies.loc[movies['MovieID'].isin(top5_movie_ids), 'Title'].values,
 #   'Genres': movies.loc[movies['MovieID'].isin(top5_movie_ids), 'Genres'].values,
    'Similarity': similarities[top5_movie_ids]
})

# Sort by Similarity descending, if needed
output = output.sort_values(by='Similarity', ascending=False)

In [26]:
# Print final table
print(f"Recommendated movies similar to movie ID {target_movie_id} based on cosine similarity \n",output.to_string(index=False))

Recommendated movies similar to movie ID 2025 based on cosine similarity 
  Index  MovieID                                        Title  Similarity
     1     2037                            Funny Face (1957)    0.922041
     2     3289                            Candleshoe (1977)    0.915784
     3      901                  They Might Be Giants (1971)    0.903328
     4     3574 Not One Less (Yi ge dou bu neng shao) (1999)    0.903053
     5     3284           Carnosaur 3: Primal Species (1996)    0.901310
