In [98]:
import numpy as np
from numpy.linalg import norm
import pandas as pd

from random import normalvariate
from math import sqrt

In [99]:
# Load the per-user ratings table from CSV into a pandas DataFrame.
# The path is relative, so the notebook has to be run from the directory
# that contains `ml-latest-small/`.
df = pd.read_csv("./ml-latest-small/ratings.csv")
# Preview the first rows (columns: userId, movieId, rating, timestamp).
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [100]:
# Pivot the long ratings table into a dense user x movie matrix:
# one row per userId, one column per movieId. If a (user, movie) pair
# appears more than once the first rating is kept; pairs the user never
# rated are filled with 0.0.
movie_ratings = (
    df.groupby(['userId', 'movieId'])['rating']
    .first()
    .unstack(fill_value=0.0)
)
movie_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# (number of users, number of movies) in the pivoted ratings matrix.
movie_ratings.shape

(610, 9724)

In [102]:
# Load the movie metadata table from CSV; it is used later to turn
# movieId values back into human-readable titles.
movies = pd.read_csv("./ml-latest-small/movies.csv")
# Preview the first rows (columns: movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [103]:
# (number of movies, number of metadata columns) in the movies table.
movies.shape

(9742, 3)

In [104]:
def randomUnitVector(n):
    '''
        Draw a random direction in n-dimensional space.

        Samples n independent standard-normal values and rescales them
        to Euclidean length 1. Returns a plain Python list of n floats
        (an empty list when n is 0).
    '''
    gaussians = []
    for _ in range(n):
        gaussians.append(normalvariate(0, 1))

    length = sqrt(sum(g * g for g in gaussians))
    return [g / length for g in gaussians]


def svd_1d(A, epsilon=1e-10, max_iterations=None):
    ''' The one-dimensional SVD

        Estimate the dominant singular vector of the 2-D array A by
        power iteration on the smaller of the two Gram matrices.

        Parameters:
            A: 2-D numpy array of shape (n, m).
            epsilon: convergence tolerance. Iteration stops once the
                absolute cosine between two consecutive iterates
                exceeds 1 - epsilon.
            max_iterations: optional cap on the number of iterations.
                None (the default) means "iterate until converged".
                When the cap is hit a RuntimeWarning is issued and the
                current estimate is returned.

        Returns:
            A unit-length 1-D numpy array of length min(n, m): the
            dominant right singular vector when n > m (iteration on
            A^T A), otherwise the dominant left singular vector
            (iteration on A A^T). The starting vector is random, so the
            sign of the result and the iteration count vary between
            runs.

        Raises:
            ValueError: if the iteration produces NaN or infinity,
                which means A contains non-finite entries.
    '''

    n, m = A.shape
    x = randomUnitVector(min(n, m))
    currentV = np.asarray(x, dtype=float)

    # Work with whichever Gram matrix is smaller: (m x m) or (n x n).
    if n > m:
        B = np.dot(A.T, A)
    else:
        B = np.dot(A, A.T)

    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentNorm = norm(currentV)

        # NaN/inf would make every later comparison False and the loop
        # would never terminate, so fail loudly instead.
        if not np.isfinite(currentNorm):
            raise ValueError("svd_1d: matrix contains NaN or infinite entries")

        # B maps the iterate to zero (e.g. A is all zeros): the iterate
        # already is a singular vector for singular value 0. Dividing by
        # the zero norm would produce NaNs and an endless loop.
        if currentNorm == 0:
            return lastV

        currentV = currentV / currentNorm

        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV

        if max_iterations is not None and iterations >= max_iterations:
            import warnings
            warnings.warn(
                "svd_1d: no convergence after {} iterations; "
                "returning the current estimate".format(iterations),
                RuntimeWarning,
            )
            return currentV


def svd(A, k=None, epsilon=1e-10):
    '''
        Compute the singular value decomposition of a matrix A
        using the power method. A is the input matrix, and k
        is the number of singular values you wish to compute.
        If k is None, this computes the full-rank decomposition.

        Parameters:
            A: anything np.array can turn into a 2-D float array of
                shape (n, m), e.g. a DataFrame.
            k: number of singular triplets requested. Values larger
                than min(n, m) are clamped, because a matrix cannot
                have more singular values than that.
            epsilon: convergence tolerance forwarded to svd_1d.

        Returns:
            (U, S, Vt) where U has shape (n, r), S has shape (r,) and
            Vt has shape (r, m), so that U @ diag(S) @ Vt approximates
            A. r equals the (clamped) k unless the matrix runs out of
            numerically non-zero singular values first, in which case
            the decomposition stops early and r is smaller (r is 0 for
            an all-zero matrix or k == 0).

        Raises:
            ValueError: if k is negative or A contains NaN/infinity.
    '''
    A = np.array(A, dtype=float)
    n, m = A.shape

    # Non-finite entries make the power iteration spin forever.
    if not np.isfinite(A).all():
        raise ValueError("svd: input matrix contains NaN or infinite entries")

    maxRank = min(n, m)
    if k is None:
        k = maxRank
    elif k < 0:
        raise ValueError("svd: k must be non-negative, got {}".format(k))
    else:
        k = min(k, maxRank)

    # Below this magnitude the residual is rounding noise. Same shape as
    # the default tolerance of numpy.linalg.matrix_rank, with the
    # Frobenius norm standing in for the largest singular value.
    tolerance = np.finfo(float).eps * max(n, m) * norm(A)

    svdSoFar = []
    # Residual after removing the components found so far. It is updated
    # incrementally, which performs exactly the same subtractions as
    # rebuilding it from A each round but only once per component.
    matrixFor1D = A.copy()

    for i in range(k):
        print(i, k)

        # Nothing left to decompose: stop rather than iterate on zeros.
        if norm(matrixFor1D) <= tolerance:
            break

        if n > m:
            v = svd_1d(matrixFor1D, epsilon=epsilon)  # next singular vector
            u_unnormalized = np.dot(A, v)
            sigma = norm(u_unnormalized)  # next singular value
            if sigma <= tolerance:
                break  # would divide by (numerical) zero below
            u = u_unnormalized / sigma
        else:
            u = svd_1d(matrixFor1D, epsilon=epsilon)  # next singular vector
            v_unnormalized = np.dot(A.T, u)
            sigma = norm(v_unnormalized)  # next singular value
            if sigma <= tolerance:
                break  # would divide by (numerical) zero below
            v = v_unnormalized / sigma

        svdSoFar.append((sigma, u, v))
        matrixFor1D -= sigma * np.outer(u, v)

    # zip(*[]) cannot be unpacked, so build the empty result explicitly.
    if not svdSoFar:
        return np.empty((n, 0)), np.empty(0), np.empty((0, m))

    singularValues, us, vs = [np.array(x) for x in zip(*svdSoFar)]
    return us.T, singularValues, vs

In [105]:
# Factorise the user x movie ratings matrix with the power-method svd()
# defined above, asking for the 50 largest singular triplets.
# svd() returns (left vectors as columns, singular values, right vectors
# as rows), so U has one row per user and Vt has one column per movie.
U, S, Vt = svd(movie_ratings, k=50)
# Dump each factor followed by its shape. Note the last label says "Vh"
# but the value printed is the Vt variable.
print(f'U:\n {U}\n {U.shape}\n')
print(f'S:\n {S}\n {S.shape}\n')
print(f'Vh:\n {Vt}\n {Vt.shape}\n')

0 50
converged in 10 iterations!
1 50
converged in 25 iterations!
2 50
converged in 51 iterations!
3 50
converged in 46 iterations!
4 50
converged in 90 iterations!
5 50
converged in 55 iterations!
6 50
converged in 48 iterations!
7 50
converged in 327 iterations!
8 50
converged in 65 iterations!
9 50
converged in 117 iterations!
10 50
converged in 235 iterations!
11 50
converged in 186 iterations!
12 50
converged in 143 iterations!
13 50
converged in 188 iterations!
14 50
converged in 642 iterations!
15 50
converged in 176 iterations!
16 50
converged in 97 iterations!
17 50
converged in 320 iterations!
18 50
converged in 376 iterations!
19 50
converged in 701 iterations!
20 50
converged in 207 iterations!
21 50
converged in 237 iterations!
22 50
converged in 282 iterations!
23 50
converged in 336 iterations!
24 50
converged in 192 iterations!
25 50
converged in 603 iterations!
26 50
converged in 351 iterations!
27 50
converged in 391 iterations!
28 50
converged in 249 iterations!
29 5

In [106]:
# Rebuild the low-rank approximation U * diag(S) * Vt of the ratings
# matrix; its entries serve as predicted ratings for every user/movie pair.
sigma_diag_matrix = np.diag(S)
all_user_predicted_ratings = U.dot(sigma_diag_matrix).dot(Vt)

# Re-attach the userId / movieId labels of the original ratings matrix.
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    index=movie_ratings.index,
    columns=movie_ratings.columns,
)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.181504,0.393706,0.838228,-0.08236,-0.546588,2.521744,-0.887417,-0.025276,0.196893,1.60717,...,-0.024996,-0.021425,-0.028567,-0.028567,-0.024996,-0.028567,-0.024996,-0.024996,-0.024996,-0.05896
2,0.209824,0.004826,0.030744,0.017251,0.183769,-0.060652,0.083297,0.023795,0.0481,-0.151973,...,0.018895,0.016196,0.021594,0.021594,0.018895,0.021594,0.018895,0.018895,0.018895,0.031966
3,0.013372,0.034726,0.050526,0.000201,-0.005594,0.114673,-0.007466,0.000737,0.004743,-0.061263,...,-0.001613,-0.001382,-0.001843,-0.001843,-0.001613,-0.001843,-0.001613,-0.001613,-0.001613,-0.000529
4,2.011793,-0.394921,-0.290386,0.093879,0.123146,0.25972,0.472678,0.035964,0.011265,-0.021781,...,0.001965,0.001684,0.002246,0.002246,0.001965,0.002246,0.001965,0.001965,0.001965,-0.021455
5,1.336689,0.772954,0.064579,0.11388,0.274975,0.584481,0.25104,0.131531,-0.086315,1.035386,...,-0.004408,-0.003778,-0.005038,-0.005038,-0.004408,-0.005038,-0.004408,-0.004408,-0.004408,-0.006097


In [107]:
def get_high_recommended_movies(user_id, threshold=3):
    '''
        List the titles of movies predicted to score highly for a user.

        Reads three notebook-level DataFrames: movie_ratings (actual
        ratings, users x movies), preds_df (predicted ratings, same
        labels) and movies (movieId -> title metadata).

        Parameters:
            user_id: row label of the user in movie_ratings / preds_df.
            threshold: a rating counts as "high" when it is strictly
                greater than this value. Applied to both the actual and
                the predicted ratings. Defaults to 3.

        Returns:
            A list of title strings, ordered from highest to lowest
            predicted rating, for movies whose prediction is above the
            threshold and that the user has not already rated above
            the threshold. Runs of whitespace inside a title are
            collapsed to single spaces.

        Raises:
            KeyError: if user_id is not a row of movie_ratings/preds_df.
    '''
    movies_rated_by_user = movie_ratings.loc[user_id]
    # A set makes the "already liked" check below O(1) per movie.
    movies_high_rated_by_user = set(
        movies_rated_by_user[movies_rated_by_user > threshold].index.tolist()
    )

    movies_recommended_for_user = preds_df.loc[user_id]
    movies_high_recommend_for_user = (
        movies_recommended_for_user[movies_recommended_for_user > threshold]
        .sort_values(ascending=False)
        .index.tolist()
    )

    # NOTE: only movies the user already rated highly are filtered out;
    # a movie the user rated at or below the threshold can still appear.
    recommend_id = [
        movie_id
        for movie_id in movies_high_recommend_for_user
        if movie_id not in movies_high_rated_by_user
    ]

    # Look titles up by value. Parsing str(Series) would cut long titles
    # off with "..." at pandas' display width and would return repr
    # debris for an unknown id. The first row wins on duplicate ids.
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']

    recommend_name = []
    for movie_id in recommend_id:
        if movie_id in title_by_id.index:
            # split/join keeps the whitespace normalisation callers are used to.
            movie_name = ' '.join(str(title_by_id.loc[movie_id]).split())
        else:
            # Make a missing metadata row visible instead of emitting garbage.
            movie_name = 'Unknown title (movieId={})'.format(movie_id)
        recommend_name.append(movie_name)

    return recommend_name

In [108]:
# User to generate recommendations for. Hard-coded so the cell gives the
# same target on every run.
user_id = 200
# (For interactive use, read the id with int(input()) instead.)
recommend_movies = get_high_recommended_movies(user_id)

# Print the chosen user followed by one recommended title per line,
# in the order returned (highest predicted rating first).
print(f'user_id = {user_id}\n')
print('recommendation Movies: ')
for movie in recommend_movies:
    print(movie)

user_id = 200

recommendation Movies: 
Shrek (2001)
Mrs. Doubtfire (1993)
Titanic (1997)
40-Year-Old Virgin, The (2005)
Dead Poets Society (1989)
Groundhog Day (1993)
Anchorman: The Legend of Ron Burgundy (2004)
Rock, The (1996)
