<a href="https://colab.research.google.com/github/KevinYih/BigDataDemo/blob/main/SVD_Netflix_movie_ratings_by_Kevin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SVD - Netflix movie ratings

Adapted from: https://colab.research.google.com/github/ashishdasari148/Recommender-Systems-using-Collaborative-Filtering/blob/master/Netflix.ipynb#scrollTo=qX7Nk4tUGZND

In [None]:
#!echo '{"username":"YOUR_USERNAME","key":"YOUR_API_KEY"}' > /root/.kaggle/kaggle.json
#!chmod 600 /root/.kaggle/kaggle.json
!pip install kaggle
!kaggle datasets download -d netflix-inc/netflix-prize-data

In [None]:
import os
import zipfile
# Unpack the downloaded Kaggle archive into the local 'data' directory.
with zipfile.ZipFile('netflix-prize-data.zip', mode='r') as archive:
    archive.extractall(path='data')

In [None]:
import pandas as pd

import re
import os
import time
import matplotlib.pyplot as plt
import numpy as np
# Directory the archive was extracted into; all dataset files are read from
# (and the generated CSV files written to) this directory.
dir_name = 'data'

In [None]:
# Converting the txt files into csv files
#
# In each combined_data_N.txt a line of the form "<digits>:" introduces a
# movie, and every following line is a rating row for that movie. The loop
# writes a flat CSV in which each rating row is prefixed with the id taken
# from the most recent "<digits>:" line.
header_pattern = re.compile(r"(\d+):")
files = ['combined_data_1.txt','combined_data_2.txt','combined_data_3.txt','combined_data_4.txt']
for j in files:
    src_path = os.path.join(dir_name, j)
    dst_path = os.path.join(dir_name, j + '.csv')
    # The output is opened with "w" (not "a") so that re-running this cell
    # rewrites the CSV instead of appending a second copy of every rating.
    # The with-block closes both files even if an error occurs mid-file.
    with open(src_path, "r") as file1, open(dst_path, "w") as file2:
        print(j + " opened...")
        movienumber = None
        # Iterating the file object streams it line by line instead of
        # loading the whole file into memory at once.
        for i in file1:
            temp = header_pattern.match(i)
            if temp:
                movienumber = temp.group(1)
            elif movienumber is None:
                # A rating row before any movie header cannot be attributed.
                raise ValueError(j + ": rating row found before any '<movie id>:' header line")
            else:
                file2.write(movienumber + "," + i)
    print(j + " closed...")

In [None]:
# Load the movie catalogue. The file has no header row, so the three columns
# that are kept are named explicitly.
movie_titles = pd.read_csv(
    os.path.join(dir_name, 'movie_titles.csv'),
    encoding='ISO-8859-1',
    header=None,
    names=['Movie', 'Year of Release', 'Title'],
    usecols=[0, 1, 2],
)
movie_titles

In [None]:
## appending all the csv files of rating data into a single dataframe
# Only Movie, User and Rating are used afterwards, so the Date column is
# excluded at parse time (usecols) instead of being loaded for every row and
# dropped afterwards.
dfs = []
for i in ['combined_data_1.txt.csv','combined_data_2.txt.csv','combined_data_3.txt.csv','combined_data_4.txt.csv']:
    df_ = pd.read_csv(os.path.join(dir_name, i), header=None, names=['Movie', 'User', 'Rating', 'Date'], usecols=[0, 1, 2])
    dfs.append(df_)
df_all = pd.concat(dfs, ignore_index=True)

In [None]:
# Total number of rating rows loaded from the four CSV files.
print("number of ratings:", df_all.shape[0])

In [None]:
# search for your favorite movie

# Case-insensitive substring search over the titles. na=False makes a missing
# title count as "no match", so the mask stays purely boolean and can always
# be used to index movie_titles.
indices = movie_titles.Title.str.contains('matrix', flags=re.IGNORECASE, na=False)
movie_titles[indices]
# 14691    The Matrix
# 3925     The Matrix: Reloaded
# 16767    The Matrix: Revisited
# 468      The Matrix: Revolutions
# 397      A Night in Casablanca
# 10661    Tokyo Story
# 963      A Streetcar Named Desire
# 6099     Apocalypse Now
# 4253    Kind Hearts and Coronets
# 2412     Harry Potter and the Prisoner of Azkaban: Bonu...
# 11443    Harry Potter and the Chamber of Secrets



In [None]:
# Option 1 (disabled): pick movies at random.
# NOTE(review): `df_raw` is not defined anywhere in the visible notebook;
# presumably `df_all` is meant -- confirm before enabling.
#movie_ids = set(df_raw.sample(frac=.00001).Movie.unique())

# Option 2: a hand-picked set of movie ids.
movie_ids = {
    14691, 3925, 16767, 468, 397, 10661, 963,
    6099, 4253, 2412, 11443, 13082, 17339,
}


In [None]:
# Show every rating row recorded for movie id 17339.
df_all.loc[df_all.Movie == 17339]

In [None]:
# saving the merged file to drive for further use.
#df_all.to_csv(path_or_buf=os.path.join(dir_name, 'data.txt.csv'),index=False)

In [None]:
# loading the saved csv into dataframe
#df_all = pd.read_csv(os.path.join(dir_name, 'data.txt.csv'))

In [None]:
# Row count of the full ratings table.
print("number of ratings:", df_all.shape[0])

In [None]:
# Keep only the ratings that belong to one of the selected movies.
df_movie_ids = df_all.loc[df_all.Movie.isin(movie_ids)]
print("Unique movie ids in the dataframe:", df_movie_ids.Movie.unique())
print("Number of unique user ids:", df_movie_ids.User.unique().size)

In [None]:
# Draw 0.1% of the selected-movie ratings. The seed is fixed so that the
# sampled users -- and everything computed from them further down (matrix
# shape, singular values, printed reconstructions) -- is the same on every
# run of the notebook.
df_ = df_movie_ids.sample(frac=0.001, random_state=42)
len(df_)

In [None]:
# Build the working subset: the users that appear in the sample, together
# with all of their ratings for the selected movies.
user_ids = set(df_.User.unique())
df_small = df_all.loc[df_all.Movie.isin(movie_ids) & df_all.User.isin(user_ids)]
print("Unique movie ids in the dataframe:", df_small.Movie.unique())
print("Number of unique user ids:", df_small.User.unique().size)
print("Number of ratings:", df_small.shape[0])

In [None]:
# Lookup tables between ids and matrix positions:
#   itom / itou : position -> movie id / user id (ids in ascending order)
#   mtoi / utoi : movie id / user id -> position
itom = sorted(movie_ids)
mtoi = dict(zip(itom, range(len(itom))))
itou = sorted(user_ids)
utoi = dict(zip(itou, range(len(itou))))

In [None]:
# Dense (users x movies) matrix; user/movie pairs without a rating stay 0.
user_movie_rating = np.zeros((len(itou), len(itom)))

# Translate every rating's user id / movie id to its row / column position,
# then write all ratings with one indexed assignment. This replaces a
# Python-level iterrows() loop, which constructs a Series object per row.
# Plain dict lookups are kept so an id missing from utoi/mtoi still raises
# KeyError.
n_ratings = len(df_small)
user_index = np.fromiter((utoi[u] for u in df_small.User), dtype=np.intp, count=n_ratings)
movie_index = np.fromiter((mtoi[m] for m in df_small.Movie), dtype=np.intp, count=n_ratings)
user_movie_rating[user_index, movie_index] = df_small.Rating.to_numpy()

In [None]:
# (number of users, number of movies), as allocated from len(itou) x len(itom)
user_movie_rating.shape

In [None]:
# more efficient way of building a user-movie rating matrix
# (pivot_table is called without fill_value, so user/movie pairs that have no
# rating come out as NaN here rather than 0)
um = df_small.pivot_table(values='Rating', index='User', columns='Movie').to_numpy()
um.shape

In [None]:
# Peek at the first ten user rows of the rating matrix.
user_movie_rating[:10]

In [None]:
# Factor the rating matrix as A = U @ diag(S) @ Vt.
# full_matrices=False requests the reduced ("thin") factors; S holds the
# singular values as a 1-D array.
A = user_movie_rating
U, S, Vt = np.linalg.svd(A, full_matrices=False, compute_uv=True)

In [None]:
# Shapes of the three SVD factors, in the order U, S, Vt.
print(*(factor.shape for factor in (U, S, Vt)))

In [None]:
# Multiply the three factors back together. With every singular value kept,
# this reproduces A up to floating-point rounding.
A_hat = U @ np.diag(S) @ Vt

In [None]:
# Show floats with two decimals and without scientific notation, then print
# the first ten rows of the full reconstruction.
np.set_printoptions(
    suppress=True,
    formatter={'float': '{:0.2f}'.format},
)
print(A_hat[:10])

In [None]:
# Share of the total squared singular values carried by the three largest.
sum(s * s for s in S[:3]) / sum(s * s for s in S)

0.8748565522564679

In [None]:
# Rank-r approximation: keep only the first r singular values together with
# the matching columns of U and rows of Vt.
r = 8
A_hat_r = U[:, :r] @ np.diag(S[:r]) @ Vt[:r]
np.set_printoptions(
    suppress=True,
    formatter={'float': '{:0.2f}'.format},
)
print(A_hat_r[:10])