<center><img src="img/logo_hse_black.jpg"></center>

<h1><center>Data Analysis</center></h1>
<h2><center>Seminar: Recsys </center></h2>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# Data preprocessing

1. Download [data](https://cloud.mail.ru/public/CSjR/mPctRVc2u) with ratings and movies
2. Examine ids of movies and users
3. Examine documentation of scipy.sparse.coo_matrix
3. Encode ids in a proper way to ease creation of rating matrix


The goal is to create a matrix of user ratings for all movies:

<center><img src="img/rating.png"></center>

### Data reading

In [None]:
! wget https://raw.githubusercontent.com/hushchyn-mikhail/hse_se_ml/s15/2020/s15-recsys/data/user_ratedmovies.dat

In [None]:
! wget https://raw.githubusercontent.com/hushchyn-mikhail/hse_se_ml/s15/2020/s15-recsys/data/movies.dat

In [2]:
# read user ratings

df_ratings = pd.read_csv('data/user_ratedmovies.dat', sep='\t')
df_ratings.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [3]:
# read movie descriptions

df_movies = pd.read_csv('data/movies.dat', sep='\t', encoding = "ISO-8859-1")
df_movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [4]:
# data info

print("Number of users: ", df_ratings.userID.nunique())
print("Number of movies with user ratings: ", df_ratings.movieID.nunique())
print("Number of movies: ", df_movies.loc[:, 'id'].nunique())

Number of users:  2113
Number of movies with user ratings:  10109
Number of movies:  10197


### One-hot encoding userID and movieID

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# userID one-hot encoding

print("Before: ", df_ratings.userID.values[:20])

# fit encoder
enc_user = LabelEncoder()
enc_user.fit(df_ratings.userID.values)

# apply encoder
df_ratings.loc[:, 'userID'] = enc_user.transform(df_ratings.loc[:, 'userID'].values)
print("After: ", df_ratings.userID.values[:20])

Before:  [75 75 75 75 75 75 75 75 75 75 75 75 75 75 75 75 75 75 75 75]
After:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [7]:
# movieID one-hot encoding

print("Before: ", df_movies.loc[:, 'id'].values[:20])

# fit encoder
enc_movies = LabelEncoder()
enc_movies.fit(df_movies.loc[:, 'id'].values)

# apply encoder
df_ratings.loc[:, 'movieID'] = enc_movies.transform(df_ratings.loc[:, 'movieID'].values)
df_movies.loc[:, 'id'] = enc_movies.transform(df_movies.loc[:, 'id'].values)
print("After: ", df_movies.loc[:, 'id'].values[:20])

Before:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
After:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


### Create a user reatings matrix

In [8]:
from scipy.sparse import coo_matrix

In [9]:
n_movies = enc_movies.classes_.shape[0]
n_users  = enc_user.classes_.shape[0]

print("Number of users:  ", n_users)
print("Number of movies: ", n_movies)

Number of users:   2113
Number of movies:  10197


In [10]:
# create the matrix

R = coo_matrix((df_ratings.rating.values,  # ratings
               (df_ratings.userID.values, df_ratings.movieID.values))) # userID and moviesID

In [11]:
print("Size (n_users, n_movies): ", R.shape)
print("Number of non-zeros ratings: ", R.nnz)

Size (n_users, n_movies):  (2113, 10197)
Number of non-zeros ratings:  855598


In [12]:
print("Ratings of the first 10 users for the first 10 movies:")

R.toarray()[:10, :10]

Ratings of the first 10 users for the first 10 movies:


array([[0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [3. , 2. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5],
       [4. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. ],
       [4.5, 4. , 0. , 0. , 0. , 0. , 2. , 0. , 0. , 0. ],
       [2.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. ],
       [4. , 0. , 0. , 0. , 3. , 0. , 3.5, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])

# SVD on Rating matrix

The goal is to decompoze matrix R (X) using SVD:
<center><img src="img/svd.png"></center>


where
- 𝑈 - unitary matrix, which contains eigenvectors of $𝑋𝑋^⊤$
- 𝑉 - unitary matrix, which contains eigenvectors of $𝑋^⊤𝑋$
- Σ - diagonal matrix with singular values $\sigma_i = \sqrt{\lambda_i}$

Interpretation:
<center><img src="img/matrix_factorization.png"></center>

where
- $R$ - user ratings
- $P = U \Sigma^{1/2}$ - its components sometimes can be iterpreted as user's interests in topic 
- $Q = \Sigma^{1/2}V^T$ - its components sometimes can be iterpreted as movies' relevance to topic

1. Find latent representation of movies with scipy.sparse.linalg.svds
2. For each movie find 10 nearest neigbours in that feature space

### SVD

In [13]:
from scipy.sparse.linalg import svds

In [14]:
# apply SVD to R matrix, keeping only the first 10 main components (topics)

U, S, V_T = svds(R, k=10)
V = V_T.T

print("U shape: ", U.shape)
print("S shape: ", S.shape)
print("V_T shape: ", V_T.shape)

U shape:  (2113, 10)
S shape:  (10,)
V_T shape:  (10, 10197)


In [15]:
# S is diagonal matrix ant it save like np.array
S

array([ 232.86920461,  261.70125609,  300.14199845,  315.32085391,
        348.46928961,  388.72357327,  395.35976356,  493.67832495,
        581.54459324, 1861.07048692])

### Search for the nearest movies in the topics space

In [16]:
from sklearn.neighbors import NearestNeighbors

In [17]:
# calculate distances between movies

nn = NearestNeighbors(n_neighbors=11, metric='cosine', n_jobs=-1)
nn.fit(V)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=11, p=2,
                 radius=1.0)

In [18]:
# find nearest 10 (11 - 1) neighbors for each movie

ind = nn.kneighbors(V, n_neighbors=11, return_distance=False)

print("Shape: ", ind.shape)

Shape:  (10197, 11)


In [19]:
# take movie titles
movie_title = df_movies.title.values

# create a data frame with 10 nearest movies for each movie
df_nn_movies = pd.DataFrame(data=movie_title[ind], 
                            columns=['movie'] + ['nn{}'.format(i+1) for i in range(10)])

In [20]:
# show similar movies for cartoon Shrek
idx = df_nn_movies.movie.str.contains('Shrek')
df_nn_movies.loc[idx]

Unnamed: 0,movie,nn1,nn2,nn3,nn4,nn5,nn6,nn7,nn8,nn9,nn10
4007,Shrek,Finding Nemo,Pirates of the Caribbean: The Curse of the Bla...,"Monsters, Inc.",The Lord of the Rings: The Two Towers,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Return of the King,Men in Black,Spider-Man,Star Wars: Episode I - The Phantom Menace,Shrek 2
7437,Shrek 2,"Monsters, Inc.",Finding Nemo,Harry Potter and the Chamber of Secrets,Harry Potter and the Prisoner of Azkaban,Harry Potter and the Sorcerer's Stone,Harry Potter and the Goblet of Fire,Pirates of the Caribbean: The Curse of the Bla...,Shrek,Ice Age,"The Chronicles of Narnia: The Lion, the Witch ..."
9507,Shrek the Third,Happy Feet,Sydney White,The Wild,Alvin and the Chipmunks,Going the Distance,The Spiderwick Chronicles,Journey to the Center of the Earth,Open Season,I Know Who Killed Me,Monster House
10146,Shrek the Halls,Witless Protection,The Big Squeeze,Just a Little Harmless Sex,I Confess,Hush,White Lightning,The Chase,The Longest Yard,FM,Road to Rio


# User-based CF

<center><img src="img/ub-collab.png"></center>

* Split data to train and test in proportion to 80/20
* Implement similarity function
* Implement User-based CF based on $K$ most similar users. How does MAE changes with $K$ in range [5-25]
* Repeat this process with normalized ratings

In [21]:
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.spatial.distance import correlation
from sklearn.metrics import pairwise_distances

#### Split train/test using datetime

In [22]:
# define datetime

df_ratings.columns = ['userID', 'movieID', 'rating', 'day', 'month', 'year', 'hour', 'minute', 'second']
df_ratings.loc[:, 'datetime']  = pd.to_datetime(df_ratings.loc[:, ['day', 'month', 'year']])
df_ratings.head()

Unnamed: 0,userID,movieID,rating,day,month,year,hour,minute,second,datetime
0,0,2,1.0,29,10,2006,23,17,16,2006-10-29
1,0,31,4.5,29,10,2006,23,23,44,2006-10-29
2,0,105,4.0,29,10,2006,23,30,8,2006-10-29
3,0,151,2.0,29,10,2006,23,16,52,2006-10-29
4,0,154,4.0,29,10,2006,23,29,30,2006-10-29


In [23]:
# split data

q = df_ratings.datetime.quantile(.8)

# train
idx = df_ratings.datetime <= q
df_ratings_train = df_ratings.loc[idx]

# test
idx = df_ratings.datetime > q
df_ratings_test = df_ratings.loc[idx]

print("Train size: ", df_ratings_train.shape)
print("Test size:  ", df_ratings_test.shape)

Train size:  (684810, 10)
Test size:   (170788, 10)


In [24]:
# user ratings matrix

R_train = coo_matrix((df_ratings_train.rating.values, 
                     (df_ratings_train.userID.values, df_ratings_train.movieID.values)),
                     shape=(n_users, n_movies))

R_train = R_train.toarray() # to numpy array

print("R shape: ", R_train.shape)

R shape:  (2113, 10197)


### Predict ratings

<center><img src="img/algo.png" width="500"></center>

In [25]:
# define similarity score between two users

def my_metric(u, v):
    #find idx where u not eq 0 and v not eq 0
    idx = (u != 0) & (v != 0)
    if len(u[idx]) > 2:
        sim_score = 2 - cosine(u[idx], v[idx])
    else:
        sim_score = 0
    return sim_score

In [26]:
# compute similarities score between each pair of users using custom my_metric

sim = pdist(R_train, metric=my_metric)
print("Shape: ", sim.shape)

Shape:  (2231328,)


In [27]:
Sim = squareform(sim)
print("Shape: ", Sim.shape)

Shape:  (2113, 2113)


In [28]:
df_ratings_train.head()

Unnamed: 0,userID,movieID,rating,day,month,year,hour,minute,second,datetime
0,0,2,1.0,29,10,2006,23,17,16,2006-10-29
1,0,31,4.5,29,10,2006,23,23,44,2006-10-29
2,0,105,4.0,29,10,2006,23,30,8,2006-10-29
3,0,151,2.0,29,10,2006,23,16,52,2006-10-29
4,0,154,4.0,29,10,2006,23,29,30,2006-10-29


In [29]:
df_ratings_test.head()

Unnamed: 0,userID,movieID,rating,day,month,year,hour,minute,second,datetime
925,5,164,1.0,30,7,2008,18,59,11,2008-07-30
936,5,354,3.0,30,7,2008,18,33,3,2008-07-30
937,5,355,3.0,30,7,2008,18,38,38,2008-07-30
938,5,367,2.5,27,8,2008,4,26,37,2008-08-27
939,5,461,4.5,16,4,2008,19,58,20,2008-04-16


In [30]:
predicted_ratings = []

# for each user-movie pair ...
for i, r in tqdm_notebook(df_ratings_test.iterrows()):
    
    watched_users = np.where(R_train[:, int(r['movieID'])])[0]
    sim = Sim[int(r['userID']), watched_users]
    sorted_idx = np.argsort(sim)
    
    # for k most similar users ...
    for k in range(5, 25):
        
        ratings = R_train[watched_users[sorted_idx[-k:]], int(r['movieID'])]
        
        sim_k = sim[sorted_idx[-k:]]
        
        prediction = ratings.dot(sim_k)/(sim_k.sum()) 

        predicted_ratings.append({'userID': r['userID'],
                                  'movieID': r['movieID'],
                                  'prediction': prediction,
                                  'k': k})

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






In [31]:
# create data frame with predictions

df_predicted_ratings = pd.DataFrame(predicted_ratings)
df_predicted_ratings = \
    df_predicted_ratings.join(df_ratings_test.loc[:, ['movieID', 'userID', 'rating']].set_index(['movieID', 'userID']),
                             on=['movieID', 'userID'])

df_predicted_ratings.head()

Unnamed: 0,userID,movieID,prediction,k,rating
0,5,164,2.496698,5,1.0
1,5,164,2.580224,6,1.0
2,5,164,2.639898,7,1.0
3,5,164,2.622487,8,1.0
4,5,164,2.719535,9,1.0


In [32]:
# calculate errors

df_predicted_ratings.loc[:, 'error'] = abs(df_predicted_ratings.prediction - df_predicted_ratings.rating)
df_predicted_ratings.groupby('k').error.mean()

k
5     0.730427
6     0.721938
7     0.715167
8     0.710036
9     0.706944
10    0.703943
11    0.701367
12    0.699216
13    0.697704
14    0.696110
15    0.694513
16    0.693374
17    0.692620
18    0.691364
19    0.690554
20    0.689780
21    0.688965
22    0.688411
23    0.687665
24    0.687223
Name: error, dtype: float64