In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import svd_tests as t
%matplotlib inline

# Load the movie and review tables from the cleaned CSV files
movies = pd.read_csv('./data/movies_clean.csv')
reviews = pd.read_csv('./data/reviews_clean.csv')

# Discard the 'Unnamed: 0' column from both tables
movies = movies.drop(columns=['Unnamed: 0'])
reviews = reviews.drop(columns=['Unnamed: 0'])

# Build the user-by-movie matrix: one row per user_id, one column per movie_id,
# keeping the highest rating when a (user, movie) pair occurs more than once
user_items = reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
user_by_movie = (
    user_items
    .groupby(['user_id', 'movie_id'])['rating']
    .max()
    .unstack()
)

# Keep three movies and only the users who have a rating for all three of them
user_movie_subset = user_by_movie.loc[:, [75314, 68646, 99685]].dropna(axis='index')
ratings_mat = np.matrix(user_movie_subset)
print(ratings_mat)

[[ 7. 10.  8.]
 [ 6. 10.  7.]
 [ 8.  9.  8.]
 [ 8. 10. 10.]
 [ 9.  9.  9.]
 [ 8.  9.  9.]]


## Gradient Descent

- We want to understand how $user_i$ and $movie_j$ are related. To capture this relationship, we can study the
latent factors that connect them.
- The latent factors might be, for example: sad, AI, human, ... We will try to extract:
    - The `tastes` of each user for the latent factors. We map $user_i \rightarrow u_i$, where the number of columns in
  the vector $u_i$ equals the number of latent factors.
    - The `tastes` of each movie for the latent factors. We map
  $movie_j \rightarrow v_j$, where the number of columns in $v_j$ equals the number of latent factors.
    - Then we estimate the rating that $user_i$ gives to $movie_j$ as:
    $$\hat{r}_{ij} = r(user_i, movie_j) = u_i v_j^T$$
    - To find good values for $u_i$ and $v_j$, we minimize the squared error on the known ratings:
    $$\min_{u_i, v_j} (r_{ij} - \hat{r}_{ij})^2 = \min_{u_i, v_j} (r_{ij} - u_i v_j^T)^2$$
    - Using **Gradient Descent** with learning rate $\alpha$, each step updates:
      $$u_i = u_i - \alpha \frac{\partial}{\partial u_i}(r_{ij} - u_i v_j^T)^2 = u_i + 2\alpha (r_{ij} - u_i v_j^T) v_j$$
      $$v_j = v_j - \alpha \frac{\partial}{\partial v_j}(r_{ij} - u_i v_j^T)^2 = v_j + 2\alpha (r_{ij} - u_i v_j^T) u_i$$



In [2]:


def FunkSVD(ratings_mat, latent_features=4, learning_rate=0.0001, iters=100,
            random_state=None, verbose=True):
    '''
    This function performs matrix factorization using a basic form of FunkSVD with no regularization

    INPUT:
    ratings_mat - (numpy array or matrix) users as rows, movies as columns, and ratings as values;
                  entries that are NaN or not greater than 0 are treated as missing and skipped
    latent_features - (int) the number of latent features used
    learning_rate - (float) the learning rate
    iters - (int) the number of iterations (full passes over the observed ratings)
    random_state - (int or None) seed for the random initialization; when None the global
                   numpy random state is used
    verbose - (bool) when True, print a header and the mean squared error after every iteration

    OUTPUT:
    user_mat - (numpy array) a user by latent feature matrix
    movie_mat - (numpy array) a latent feature by movie matrix
    '''

    # Work on a plain 2-D float array so np.matrix and integer inputs index the same way
    ratings = np.asarray(ratings_mat, dtype=float)
    n_users, n_movies = ratings.shape

    # A rating "exists" when it is > 0; NaN > 0 evaluates to False, so NaNs are excluded too.
    # errstate silences the invalid-comparison warning some numpy versions emit for NaN.
    with np.errstate(invalid='ignore'):
        observed = ratings > 0

    # np.nonzero walks the mask row by row, i.e. the same (user, movie) order as two
    # nested loops, but only the observed cells are visited on each iteration.
    observed_pairs = list(zip(*np.nonzero(observed)))

    # The MSE denominator is the number of ratings actually used in the loop below
    num_ratings = len(observed_pairs)

    # initialize the user and movie matrices with random values in [0, 1)
    if random_state is None:
        rand = np.random.rand
    else:
        rand = np.random.RandomState(random_state).rand
    user_mat = rand(n_users, latent_features)
    movie_mat = rand(latent_features, n_movies)

    # keep track of iteration and MSE
    if verbose:
        print("Optimizaiton Statistics")
        print("Iterations | Mean Squared Error ")

    # for each iteration
    for iteration in range(iters):

        # sum of squared errors accumulated during this pass
        sse_accum = 0

        # For each observed user-movie pair
        for i, j in observed_pairs:

            # compute the error as the actual minus the dot product of the user and movie latent features
            diff = ratings[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

            # Keep track of the sum of squared errors for the matrix
            sse_accum += diff**2

            # update the values in each matrix in the direction of the gradient;
            # the user factors are updated first and the movie update then uses
            # those freshly updated user factors
            user_mat[i, :] += learning_rate * (2*diff*movie_mat[:, j])
            movie_mat[:, j] += learning_rate * (2*diff*user_mat[i, :])

        # print results (nan when there were no observed ratings to average over)
        if verbose:
            mse = sse_accum / num_ratings if num_ratings else float('nan')
            print("%d \t\t %f" % (iteration+1, mse))

    return user_mat, movie_mat

In [3]:
# Fit FunkSVD with 3 latent features, a learning rate of 0.005, and 10 iterations
user_mat, movie_mat = FunkSVD(ratings_mat, iters=10, learning_rate=0.005, latent_features=3)

# Show the reconstructed ratings, then the actual ratings for comparison
print(user_mat.dot(movie_mat))
print(ratings_mat)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 62.699675
2 		 52.290081
3 		 37.733372
4 		 22.023048
5 		 10.056597
6 		 3.822939
7 		 1.500088
8 		 0.810574
9 		 0.623950
10 		 0.570868
[[ 7.62389971  9.24532557  8.51084406]
 [ 6.7252445   8.39202135  7.67239642]
 [ 7.34259866  9.1646816   8.34758902]
 [ 8.61717358 10.07754521  9.30051429]
 [ 7.91338603  9.54207716  8.76174181]
 [ 7.56285639  9.61416299  8.72403154]]
[[ 7. 10.  8.]
 [ 6. 10.  7.]
 [ 8.  9.  8.]
 [ 8. 10. 10.]
 [ 9.  9.  9.]
 [ 8.  9.  9.]]


In [4]:
# Fit FunkSVD with 3 latent features, a learning rate of 0.005, and 300 iterations
user_mat, movie_mat = FunkSVD(ratings_mat, iters=300, learning_rate=0.005, latent_features=3)
# Show the reconstructed ratings, then the actual ratings for comparison
print(user_mat.dot(movie_mat))
print(ratings_mat)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 58.402902
2 		 45.141741
3 		 28.931030
4 		 14.460452
5 		 5.674305
6 		 2.030330
7 		 0.911724
8 		 0.622230
9 		 0.547727
10 		 0.524051
11 		 0.512913
12 		 0.505458
13 		 0.499375
14 		 0.493905
15 		 0.488725
16 		 0.483673
17 		 0.478659
18 		 0.473629
19 		 0.468550
20 		 0.463399
21 		 0.458161
22 		 0.452824
23 		 0.447381
24 		 0.441824
25 		 0.436150
26 		 0.430354
27 		 0.424434
28 		 0.418387
29 		 0.412213
30 		 0.405911
31 		 0.399482
32 		 0.392927
33 		 0.386248
34 		 0.379449
35 		 0.372532
36 		 0.365504
37 		 0.358368
38 		 0.351132
39 		 0.343802
40 		 0.336387
41 		 0.328896
42 		 0.321338
43 		 0.313723
44 		 0.306062
45 		 0.298367
46 		 0.290651
47 		 0.282925
48 		 0.275204
49 		 0.267499
50 		 0.259825
51 		 0.252196
52 		 0.244625
53 		 0.237125
54 		 0.229710
55 		 0.222393
56 		 0.215186
57 		 0.208101
58 		 0.201150
59 		 0.194343
60 		 0.187689
61 		 0.181198
62 		 0.174877
63 		 0.168734
64 

In [5]:
# Overwrite the first user's rating of the first movie with NaN (in place), then display the matrix
ratings_mat[0, 0] = np.nan
ratings_mat

matrix([[nan, 10.,  8.],
        [ 6., 10.,  7.],
        [ 8.,  9.,  8.],
        [ 8., 10., 10.],
        [ 9.,  9.,  9.],
        [ 8.,  9.,  9.]])

In [6]:
# Fit FunkSVD on the matrix that now holds a NaN: 3 latent features, learning rate 0.005, 450 iterations
user_mat, movie_mat = FunkSVD(ratings_mat, iters=450, learning_rate=0.005, latent_features=3)
# Show the reconstructed ratings, then the actual ratings for comparison
print(user_mat.dot(movie_mat))
print(ratings_mat)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 61.846640
2 		 50.396794
3 		 35.380076
4 		 20.187200
5 		 9.215112
6 		 3.627950
7 		 1.485432
8 		 0.784093
9 		 0.557550
10 		 0.475754
11 		 0.440218
12 		 0.421292
13 		 0.409114
14 		 0.399935
15 		 0.392155
16 		 0.385027
17 		 0.378186
18 		 0.371444
19 		 0.364704
20 		 0.357913
21 		 0.351045
22 		 0.344087
23 		 0.337033
24 		 0.329885
25 		 0.322644
26 		 0.315317
27 		 0.307909
28 		 0.300428
29 		 0.292884
30 		 0.285286
31 		 0.277644
32 		 0.269970
33 		 0.262275
34 		 0.254571
35 		 0.246872
36 		 0.239189
37 		 0.231536
38 		 0.223926
39 		 0.216373
40 		 0.208889
41 		 0.201487
42 		 0.194180
43 		 0.186980
44 		 0.179898
45 		 0.172947
46 		 0.166137
47 		 0.159477
48 		 0.152976
49 		 0.146642
50 		 0.140483
51 		 0.134505
52 		 0.128713
53 		 0.123112
54 		 0.117704
55 		 0.112493
56 		 0.107479
57 		 0.102662
58 		 0.098044
59 		 0.093621
60 		 0.089393
61 		 0.085356
62 		 0.081507
63 		 0.077843
64 

In [8]:
# Build a ratings matrix from the first 1000 rows (users) of user_by_movie
first_1000_users = np.matrix(user_by_movie.iloc[:1000])

# Run FunkSVD on those first 1000 users:
# 3 latent features, a learning rate of 0.005, and 500 iterations
user_mat, movie_mat = FunkSVD(first_1000_users, iters=500, learning_rate=0.005, latent_features=3)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 22.923825
2 		 10.704018
3 		 7.384616
4 		 5.731548
5 		 4.720434
6 		 4.028408
7 		 3.519630
8 		 3.126840
9 		 2.812947
10 		 2.555762
11 		 2.341098
12 		 2.159379
13 		 2.003868
14 		 1.869679
15 		 1.753187
16 		 1.651620
17 		 1.562798
18 		 1.484936
19 		 1.416536
20 		 1.356312
21 		 1.303151
22 		 1.256082
23 		 1.214264
24 		 1.176972
25 		 1.143579
26 		 1.113552
27 		 1.086436
28 		 1.061845
29 		 1.039455
30 		 1.018988
31 		 1.000212
32 		 0.982928
33 		 0.966967
34 		 0.952183
35 		 0.938450
36 		 0.925659
37 		 0.913717
38 		 0.902539
39 		 0.892053
40 		 0.882195
41 		 0.872908
42 		 0.864142
43 		 0.855852
44 		 0.848000
45 		 0.840551
46 		 0.833474
47 		 0.826742
48 		 0.820330
49 		 0.814216
50 		 0.808381
51 		 0.802806
52 		 0.797475
53 		 0.792374
54 		 0.787488
55 		 0.782803
56 		 0.778309
57 		 0.773994
58 		 0.769846
59 		 0.765857
60 		 0.762016
61 		 0.758315
62 		 0.754746
63 		 0.751300
64 		