In [110]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
from numpy.linalg import svd

from random import normalvariate
from math import sqrt

In [111]:
# Read data from the CSV file into a Pandas DataFrame
df = pd.read_csv("./SVD/ml-latest-small/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [112]:
movie_ratings = df.groupby(['userId', 'movieId'])['rating'].first().unstack(fill_value=0.0)
movie_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
movie_ratings.shape

(610, 9724)

In [119]:
# Read data from the CSV file into a Pandas DataFrame
movies = pd.read_csv("./SVD/ml-latest-small/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [120]:
movies.shape

(9742, 3)

In [114]:
def randomUnitVector(n):
    unnormalized = [normalvariate(0, 1) for _ in range(n)]
    theNorm = sqrt(sum(x * x for x in unnormalized))
    return [x / theNorm for x in unnormalized]


def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''

    n, m = A.shape
    x = randomUnitVector(min(n,m))
    lastV = None
    currentV = x

    if n > m:
        B = np.dot(A.T, A)
    else:
        B = np.dot(A, A.T)

    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)

        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV


def svd2(A, k=None, epsilon=1e-10):
    '''
        Compute the singular value decomposition of a matrix A
        using the power method. A is the input matrix, and k
        is the number of singular values you wish to compute.
        If k is None, this computes the full-rank decomposition.
    '''
    A = np.array(A, dtype=float)
    n, m = A.shape
    svdSoFar = []
    if k is None:
        k = min(n, m)

    for i in range(k):
        print(i, k)
        matrixFor1D = A.copy()

        for singularValue, u, v in svdSoFar[:i]:
            matrixFor1D -= singularValue * np.outer(u, v)

        if n > m:
            v = svd_1d(matrixFor1D, epsilon=epsilon)  # next singular vector
            u_unnormalized = np.dot(A, v)
            sigma = norm(u_unnormalized)  # next singular value
            u = u_unnormalized / sigma
        else:
            u = svd_1d(matrixFor1D, epsilon=epsilon)  # next singular vector
            v_unnormalized = np.dot(A.T, u)
            sigma = norm(v_unnormalized)  # next singular value
            v = v_unnormalized / sigma

        svdSoFar.append((sigma, u, v))

    singularValues, us, vs = [np.array(x) for x in zip(*svdSoFar)]
    return us.T, singularValues, vs

In [115]:
U, S, Vt = svd(movie_ratings, full_matrices=False)
print(f'U:\n {U}\n')
print(f'S:\n {S}\n')
print(f'Vh:\n {Vt}\n')

U:
 [[-5.55541517e-02  6.16738477e-02 -1.08974491e-02 ...  3.01873377e-03
  -2.89230819e-04  4.31423480e-04]
 [-5.86629527e-03 -1.77377186e-02 -4.42345417e-03 ... -3.26516243e-03
  -8.86828015e-03  8.86330337e-04]
 [-1.35323055e-03  2.06861278e-03  1.71517331e-03 ... -2.29472620e-03
  -2.14465926e-03 -1.61906293e-03]
 ...
 [-1.16114423e-01  1.18470415e-02 -9.76290702e-03 ...  8.84377554e-04
  -7.65989186e-04  1.37856876e-03]
 [-7.57943540e-03  1.37846340e-02 -3.97412421e-02 ...  1.21896501e-02
  -3.21030684e-03 -1.75729437e-02]
 [-1.38864880e-01 -2.02184449e-01  9.26753579e-02 ... -7.72222925e-05
  -3.38393669e-04  7.05394146e-04]]

S:
 [534.41989777 231.23661142 191.1508762  170.42250831 154.552948
 147.33575651 135.65556768 122.66302989 121.44217651 113.11144323
 109.60313933 107.93266172 105.97376877 102.05675293  99.87323589
  99.28999246  97.11713355  93.40879296  92.32408574  90.97607986
  90.42515264  88.83466993  87.29627026  86.05702164  85.15393734
  83.04476272  82.40743887 

In [100]:
U, S, Vt = svd2(movie_ratings, k=50)
print(f'U:\n {U}\n {U.shape}\n')
print(f'S:\n {S}\n {S.shape}\n')
print(f'Vh:\n {Vt}\n {Vt.shape}\n')

0 50
converged in 8 iterations!
1 50
converged in 26 iterations!
2 50
converged in 26 iterations!
3 50
converged in 51 iterations!
4 50
converged in 90 iterations!
5 50
converged in 72 iterations!
6 50
converged in 49 iterations!
7 50
converged in 396 iterations!
8 50
converged in 59 iterations!
9 50
converged in 136 iterations!
10 50
converged in 246 iterations!
11 50
converged in 237 iterations!
12 50
converged in 80 iterations!
13 50
converged in 225 iterations!
14 50
converged in 667 iterations!
15 50
converged in 201 iterations!
16 50
converged in 90 iterations!
17 50
converged in 316 iterations!
18 50
converged in 234 iterations!
19 50
converged in 492 iterations!
20 50
converged in 209 iterations!
21 50
converged in 210 iterations!
22 50
converged in 247 iterations!
23 50
converged in 559 iterations!
24 50
converged in 164 iterations!
25 50
converged in 163 iterations!
26 50
converged in 437 iterations!
27 50
converged in 269 iterations!
28 50
converged in 221 iterations!
29 50


In [116]:
sigma_diag_matrix=np.diag(S)
all_user_predicted_ratings = np.dot(np.dot(U, sigma_diag_matrix), Vt)
#all_user_predicted_ratings_demeaned = all_user_predicted_ratings +  user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = user_item.columns, index=user_item.index)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,1.306941e-14,4.0,2.782496e-15,7.4038e-15,4.0,1.316308e-14,1.158795e-15,2.584738e-15,2.525757e-14,...,-1.847481e-16,3.686287e-18,1.73906e-16,1.73906e-16,-1.847481e-16,1.73906e-16,-1.847481e-16,-1.847481e-16,-1.847481e-16,1.823628e-16
2,2.329387e-14,7.077672e-16,-3.941292e-15,1.960238e-15,-2.498002e-16,-1.279532e-14,4.024558e-15,-1.471046e-15,-4.659467e-15,-1.9304e-14,...,3.157197e-16,2.94903e-16,4.649059e-16,4.649059e-16,3.157197e-16,4.649059e-16,3.157197e-16,3.157197e-16,3.157197e-16,-1.474515e-16
3,3.21531e-15,-3.534608e-15,-2.123302e-15,-1.214306e-15,-1.054712e-15,2.581269e-15,-6.696033e-15,-9.020562e-16,9.714451e-16,1.512679e-15,...,-1.162265e-16,-7.806256e-17,-8.326673e-17,-8.326673e-17,-1.162265e-16,-8.326673e-17,-1.162265e-16,-1.162265e-16,-1.162265e-16,1.446759e-15
4,-1.105366e-14,6.232861e-15,7.452372e-15,1.526557e-16,7.521761e-15,-4.371503e-16,6.05592e-15,6.232211e-15,1.061651e-15,9.679757e-15,...,5.421011e-16,3.495468e-16,7.676151e-16,7.676151e-16,5.421011e-16,7.676151e-16,5.421011e-16,5.421011e-16,5.421011e-16,1.12757e-15
5,4.0,1.44329e-15,-5.634382e-15,-3.580469e-15,-5.842549e-15,2.053913e-15,-9.436896e-15,-2.303713e-15,1.609823e-15,-2.275957e-15,...,-2.428613e-16,-1.700029e-16,-2.810252e-16,-2.810252e-16,-2.428613e-16,-2.810252e-16,-2.428613e-16,-2.428613e-16,-2.428613e-16,1.419004e-15


In [138]:
def get_high_recommended_movies(userId):
    movies_rated_by_user = movie_ratings.loc[userId]
    movies_high_rated_by_user =  movies_rated_by_user[movies_rated_by_user > 3].index
    movies_recommended_for_user = preds_df.loc[userId]
    movies_high_recommend_for_user = movies_recommended_for_user[movies_recommended_for_user > 3].index
    recommend = set(movies_high_recommend_for_user) - set(movies_high_rated_by_user)
    recommend_name = set()
    for movie_id in recommend:
        recommend_name.add(str(movies.loc[df['movieId'] == movie_id]['title']).split('\n')[0])
    return recommend_name

get_high_recommended_movies(200)

{'1209    Air Force One (1997)',
 '1261    Starship Troopers (1997)',
 '1522    Honey, I Shrunk the Kids (1989)',
 '1960    Election (1999)',
 '3016    Traffic (2000)',
 "3410    America's Sweethearts (2001)",
 '3720    Bad News Bears, The (1976)',
 '3928    Blue Crush (2002)',
 '436    Mrs. Doubtfire (1993)',
 '4526    Matchstick Men (2003)',
 '472    Sleepless in Seattle (1993)',
 "55    Mr. Holland's Opus (1995)",
 '5944    Bad News Bears (2005)',
 '785    Mary Poppins (1964)'}